Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 7270, column 35
Potential leak of memory pointed to by 'BlockMask'
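The checker emits this class of warning when a heap allocation reachable only through a local pointer is neither freed nor handed off to an owner on every path out of the function. The sketch below is a minimal, self-contained illustration of that pattern; the struct and function names are hypothetical and this is not the code at line 7270.

    // Hypothetical illustration of a "potential leak" diagnostic.
    struct Mask { };

    Mask *createBlockMask(bool EdgeNeedsMask) {
      Mask *BlockMask = new Mask(); // allocation the analyzer tracks
      if (!EdgeNeedsMask)
        return nullptr;             // early return: 'BlockMask' is leaked here
      return BlockMask;             // ownership escapes only on this path
    }

A typical fix defers the allocation until it is known to be needed, transfers ownership to a container or cache on every path, or holds the pointer in a std::unique_ptr so the early return releases it automatically.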

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-28-092409-31635-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
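// Illustration of the "wide iteration" transformation described in the header
// above (an editorial sketch assuming VF = 4; not part of the original file):
//   // scalar loop
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// becomes a loop whose induction variable advances by the vector width,
//   // wide loop, each iteration handles four elements as one <4 x i32> op
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];
// with the remaining n % 4 iterations handled by a scalar epilogue loop.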
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SetVector.h"
73#include "llvm/ADT/SmallPtrSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpressions.h"
95#include "llvm/Analysis/TargetLibraryInfo.h"
96#include "llvm/Analysis/TargetTransformInfo.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/LLVMContext.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/InitializePasses.h"
127#include "llvm/Pass.h"
128#include "llvm/Support/Casting.h"
129#include "llvm/Support/CommandLine.h"
130#include "llvm/Support/Compiler.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME "loop-vectorize"
158#define DEBUG_TYPE LV_NAME
159
160/// @{
161/// Metadata attribute names
162static const char *const LLVMLoopVectorizeFollowupAll =
163 "llvm.loop.vectorize.followup_all";
164static const char *const LLVMLoopVectorizeFollowupVectorized =
165 "llvm.loop.vectorize.followup_vectorized";
166static const char *const LLVMLoopVectorizeFollowupEpilogue =
167 "llvm.loop.vectorize.followup_epilogue";
168/// @}
169
170STATISTIC(LoopsVectorized, "Number of loops vectorized");
171STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172
173/// Loops with a known constant trip count below this number are vectorized only
174/// if no scalar iteration overheads are incurred.
175static cl::opt<unsigned> TinyTripCountVectorThreshold(
176 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177 cl::desc("Loops with a constant trip count that is smaller than this "
178 "value are vectorized only if no scalar iteration overheads "
179 "are incurred."));
180
181// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
182// that predication is preferred, and this lists all options. I.e., the
183// vectorizer will try to fold the tail-loop (epilogue) into the vector body
184// and predicate the instructions accordingly. If tail-folding fails, there are
185// different fallback strategies depending on these values:
186namespace PreferPredicateTy {
187 enum Option {
188 ScalarEpilogue = 0,
189 PredicateElseScalarEpilogue,
190 PredicateOrDontVectorize
191 };
192} // namespace PreferPredicateTy
193
194static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195 "prefer-predicate-over-epilogue",
196 cl::init(PreferPredicateTy::ScalarEpilogue),
197 cl::Hidden,
198 cl::desc("Tail-folding and predication preferences over creating a scalar "
199 "epilogue loop."),
200 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201 "scalar-epilogue",
202 "Don't tail-predicate loops, create scalar epilogue"),
203 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204 "predicate-else-scalar-epilogue",
205 "prefer tail-folding, create scalar epilogue if tail "
206 "folding fails."),
207 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208 "predicate-dont-vectorize",
209 "prefers tail-folding, don't attempt vectorization if "
210 "tail-folding fails.")));
211
212static cl::opt<bool> MaximizeBandwidth(
213 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214 cl::desc("Maximize bandwidth when selecting vectorization factor which "
215 "will be determined by the smallest type in loop."));
216
217static cl::opt<bool> EnableInterleavedMemAccesses(
218 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220
221/// An interleave-group may need masking if it resides in a block that needs
222/// predication, or in order to mask away gaps.
223static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226
227static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
229 cl::desc("We don't interleave loops with a estimated constant trip count "
230 "below this number"));
231
232static cl::opt<unsigned> ForceTargetNumScalarRegs(
233 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234 cl::desc("A flag that overrides the target's number of scalar registers."));
235
236static cl::opt<unsigned> ForceTargetNumVectorRegs(
237 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238 cl::desc("A flag that overrides the target's number of vector registers."));
239
240static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242 cl::desc("A flag that overrides the target's max interleave factor for "
243 "scalar loops."));
244
245static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247 cl::desc("A flag that overrides the target's max interleave factor for "
248 "vectorized loops."));
249
250static cl::opt<unsigned> ForceTargetInstructionCost(
251 "force-target-instruction-cost", cl::init(0), cl::Hidden,
252 cl::desc("A flag that overrides the target's expected cost for "
253 "an instruction to a single constant value. Mostly "
254 "useful for getting consistent testing."));
255
256static cl::opt<unsigned> SmallLoopCost(
257 "small-loop-cost", cl::init(20), cl::Hidden,
258 cl::desc(
259 "The cost of a loop that is considered 'small' by the interleaver."));
260
261static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263 cl::desc("Enable the use of the block frequency analysis to access PGO "
264 "heuristics minimizing code growth in cold regions and being more "
265 "aggressive in hot regions."));
266
267// Runtime interleave loops for load/store throughput.
268static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270 cl::desc(
271 "Enable runtime interleaving until load/store ports are saturated"));
272
273/// Interleave small loops with scalar reductions.
274static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276 cl::desc("Enable interleaving for loops with small iteration counts that "
277 "contain scalar reductions to expose ILP."));
278
279/// The number of stores in a loop that are allowed to need predication.
280static cl::opt<unsigned> NumberOfStoresToPredicate(
281 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282 cl::desc("Max number of stores to be predicated behind an if."));
283
284static cl::opt<bool> EnableIndVarRegisterHeur(
285 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286 cl::desc("Count the induction variable only once when interleaving"));
287
288static cl::opt<bool> EnableCondStoresVectorization(
289 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
290 cl::desc("Enable if predication of stores during vectorization."));
291
292static cl::opt<unsigned> MaxNestedScalarReductionIC(
293 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294 cl::desc("The maximum interleave count to use when interleaving a scalar "
295 "reduction in a nested loop."));
296
297static cl::opt<bool>
298 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299 cl::Hidden,
300 cl::desc("Prefer in-loop vector reductions, "
301 "overriding the targets preference."));
302
303static cl::opt<bool> PreferPredicatedReductionSelect(
304 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305 cl::desc(
306 "Prefer predicating a reduction operation over an after loop select."));
307
308cl::opt<bool> EnableVPlanNativePath(
309 "enable-vplan-native-path", cl::init(false), cl::Hidden,
310 cl::desc("Enable VPlan-native vectorization path with "
311 "support for outer loop vectorization."));
312
313// FIXME: Remove this switch once we have divergence analysis. Currently we
314// assume divergent non-backedge branches when this switch is true.
315cl::opt<bool> EnableVPlanPredication(
316 "enable-vplan-predication", cl::init(false), cl::Hidden,
317 cl::desc("Enable VPlan-native vectorization path predicator with "
318 "support for outer loop vectorization."));
319
320// This flag enables the stress testing of the VPlan H-CFG construction in the
321// VPlan-native vectorization path. It must be used in conjunction with
322// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323// verification of the H-CFGs built.
324static cl::opt<bool> VPlanBuildStressTest(
325 "vplan-build-stress-test", cl::init(false), cl::Hidden,
326 cl::desc(
327 "Build VPlan for every supported loop nest in the function and bail "
328 "out right after the build (stress test the VPlan H-CFG construction "
329 "in the VPlan-native vectorization path)."));
330
331cl::opt<bool> llvm::EnableLoopInterleaving(
332 "interleave-loops", cl::init(true), cl::Hidden,
333 cl::desc("Enable loop interleaving in Loop vectorization passes"));
334cl::opt<bool> llvm::EnableLoopVectorization(
335 "vectorize-loops", cl::init(true), cl::Hidden,
336 cl::desc("Run the Loop vectorization passes"));
337
338/// A helper function that returns the type of loaded or stored value.
339static Type *getMemInstValueType(Value *I) {
340 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341 "Expected Load or Store instruction");
342 if (auto *LI = dyn_cast<LoadInst>(I))
343 return LI->getType();
344 return cast<StoreInst>(I)->getValueOperand()->getType();
345}
346
347/// A helper function that returns true if the given type is irregular. The
348/// type is irregular if its allocated size doesn't equal the store size of an
349/// element of the corresponding vector type at the given vectorization factor.
350static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351 assert(!VF.isScalable() && "scalable vectors not yet supported.");
352 // Determine if an array of VF elements of type Ty is "bitcast compatible"
353 // with a <VF x Ty> vector.
354 if (VF.isVector()) {
355 auto *VectorTy = VectorType::get(Ty, VF);
356 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
357 }
358
359 // If the vectorization factor is one, we just check if an array of type Ty
360 // requires padding between elements.
361 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
362}
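// Illustrative example of the check above (an editorial assumption, not from
// the original file): with a typical 64-bit data layout, Ty = i1 has an alloc
// size of 1 byte, so for VF = 4 an array of 4 x i1 occupies 4 bytes, while the
// store size of <4 x i1> is a single byte. The sizes differ, so i1 is
// "irregular" and a widened access cannot simply be bitcast from the array.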
363
364/// A helper function that returns the reciprocal of the block probability of
365/// predicated blocks. If we return X, we are assuming the predicated block
366/// will execute once for every X iterations of the loop header.
367///
368/// TODO: We should use actual block probability here, if available. Currently,
369/// we always assume predicated blocks have a 50% chance of executing.
370static unsigned getReciprocalPredBlockProb() { return 2; }
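// Illustrative use (an assumption about the surrounding cost model): a
// predicated if-block inside the loop is assumed to execute on 1 of every 2
// header iterations, so its per-iteration cost contribution is divided by the
// value returned here (2).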
371
372/// A helper function that adds a 'fast' flag to floating-point operations.
373static Value *addFastMathFlag(Value *V) {
374 if (isa<FPMathOperator>(V))
375 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
376 return V;
377}
378
379static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
380 if (isa<FPMathOperator>(V))
381 cast<Instruction>(V)->setFastMathFlags(FMF);
382 return V;
383}
384
385/// A helper function that returns an integer or floating-point constant with
386/// value C.
387static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
388 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
389 : ConstantFP::get(Ty, C);
390}
391
392/// Returns "best known" trip count for the specified loop \p L as defined by
393/// the following procedure:
394/// 1) Returns exact trip count if it is known.
395/// 2) Returns expected trip count according to profile data if any.
396/// 3) Returns upper bound estimate if it is known.
397/// 4) Returns None if all of the above failed.
398static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
399 // Check if exact trip count is known.
400 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
401 return ExpectedTC;
402
403 // Check if there is an expected trip count available from profile data.
404 if (LoopVectorizeWithBlockFrequency)
405 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
406 return EstimatedTC;
407
408 // Check if upper bound estimate is known.
409 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
410 return ExpectedTC;
411
412 return None;
413}
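// Illustrative precedence (editorial example): for `for (i = 0; i < 100; ++i)`
// SCEV proves an exact count and step 1) returns 100; for a loop bounded by a
// runtime `n` that carries branch-weight profile metadata, step 2) returns the
// profiled estimate; failing both, a finite SCEV maximum trip count supplies
// the step 3) upper bound, and otherwise None is returned.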
414
415namespace llvm {
416
417/// InnerLoopVectorizer vectorizes loops which contain only one basic
418/// block to a specified vectorization factor (VF).
419/// This class performs the widening of scalars into vectors, or multiple
420/// scalars. This class also implements the following features:
421/// * It inserts an epilogue loop for handling loops that don't have iteration
422/// counts that are known to be a multiple of the vectorization factor.
423/// * It handles the code generation for reduction variables.
424/// * Scalarization (implementation using scalars) of un-vectorizable
425/// instructions.
426/// InnerLoopVectorizer does not perform any vectorization-legality
427/// checks, and relies on the caller to check for the different legality
428/// aspects. The InnerLoopVectorizer relies on the
429/// LoopVectorizationLegality class to provide information about the induction
430/// and reduction variables that were found to a given vectorization factor.
431class InnerLoopVectorizer {
432public:
433 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
434 LoopInfo *LI, DominatorTree *DT,
435 const TargetLibraryInfo *TLI,
436 const TargetTransformInfo *TTI, AssumptionCache *AC,
437 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
438 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
439 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
440 ProfileSummaryInfo *PSI)
441 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
442 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
443 Builder(PSE.getSE()->getContext()),
444 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
445 BFI(BFI), PSI(PSI) {
446 // Query this against the original loop and save it here because the profile
447 // of the original loop header may change as the transformation happens.
448 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
449 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
450 }
451
452 virtual ~InnerLoopVectorizer() = default;
453
454 /// Create a new empty loop that will contain vectorized instructions later
455 /// on, while the old loop will be used as the scalar remainder. Control flow
456 /// is generated around the vectorized (and scalar epilogue) loops consisting
457 /// of various checks and bypasses. Return the pre-header block of the new
458 /// loop.
459 BasicBlock *createVectorizedLoopSkeleton();
460
461 /// Widen a single instruction within the innermost loop.
462 void widenInstruction(Instruction &I, VPUser &Operands,
463 VPTransformState &State);
464
465 /// Widen a single call instruction within the innermost loop.
466 void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
467 VPTransformState &State);
468
469 /// Widen a single select instruction within the innermost loop.
470 void widenSelectInstruction(SelectInst &I, VPUser &Operands,
471 bool InvariantCond, VPTransformState &State);
472
473 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
474 void fixVectorizedLoop();
475
476 // Return true if any runtime check is added.
477 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
478
479 /// A type for vectorized values in the new loop. Each value from the
480 /// original loop, when vectorized, is represented by UF vector values in the
481 /// new unrolled loop, where UF is the unroll factor.
482 using VectorParts = SmallVector<Value *, 2>;
483
484 /// Vectorize a single GetElementPtrInst based on information gathered and
485 /// decisions taken during planning.
486 void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
487 ElementCount VF, bool IsPtrLoopInvariant,
488 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
489
490 /// Vectorize a single PHINode in a block. This method handles the induction
491 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
492 /// arbitrary length vectors.
493 void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
494
495 /// A helper function to scalarize a single Instruction in the innermost loop.
496 /// Generates a sequence of scalar instances for each lane between \p MinLane
497 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
498 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
499 /// Instr's operands.
500 void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
501 const VPIteration &Instance, bool IfPredicateInstr,
502 VPTransformState &State);
503
504 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
505 /// is provided, the integer induction variable will first be truncated to
506 /// the corresponding type.
507 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
508
509 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
510 /// vector or scalar value on-demand if one is not yet available. When
511 /// vectorizing a loop, we visit the definition of an instruction before its
512 /// uses. When visiting the definition, we either vectorize or scalarize the
513 /// instruction, creating an entry for it in the corresponding map. (In some
514 /// cases, such as induction variables, we will create both vector and scalar
515 /// entries.) Then, as we encounter uses of the definition, we derive values
516 /// for each scalar or vector use unless such a value is already available.
517 /// For example, if we scalarize a definition and one of its uses is vector,
518 /// we build the required vector on-demand with an insertelement sequence
519 /// when visiting the use. Otherwise, if the use is scalar, we can use the
520 /// existing scalar definition.
521 ///
522 /// Return a value in the new loop corresponding to \p V from the original
523 /// loop at unroll index \p Part. If the value has already been vectorized,
524 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
525 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
526 /// a new vector value on-demand by inserting the scalar values into a vector
527 /// with an insertelement sequence. If the value has been neither vectorized
528 /// nor scalarized, it must be loop invariant, so we simply broadcast the
529 /// value into a vector.
530 Value *getOrCreateVectorValue(Value *V, unsigned Part);
531
532 /// Return a value in the new loop corresponding to \p V from the original
533 /// loop at unroll and vector indices \p Instance. If the value has been
534 /// vectorized but not scalarized, the necessary extractelement instruction
535 /// will be generated.
536 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
537
538 /// Construct the vector value of a scalarized value \p V one lane at a time.
539 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
540
541 /// Try to vectorize interleaved access group \p Group with the base address
542 /// given in \p Addr, optionally masking the vector operations if \p
543 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
544 /// values in the vectorized loop.
545 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
546 VPTransformState &State, VPValue *Addr,
547 VPValue *BlockInMask = nullptr);
548
549 /// Vectorize Load and Store instructions with the base address given in \p
550 /// Addr, optionally masking the vector operations if \p BlockInMask is
551 /// non-null. Use \p State to translate given VPValues to IR values in the
552 /// vectorized loop.
553 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
554 VPValue *Addr, VPValue *StoredValue,
555 VPValue *BlockInMask);
556
557 /// Set the debug location in the builder using the debug location in
558 /// the instruction.
559 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
560
561 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
562 void fixNonInductionPHIs(void);
563
564protected:
565 friend class LoopVectorizationPlanner;
566
567 /// A small list of PHINodes.
568 using PhiVector = SmallVector<PHINode *, 4>;
569
570 /// A type for scalarized values in the new loop. Each value from the
571 /// original loop, when scalarized, is represented by UF x VF scalar values
572 /// in the new unrolled loop, where UF is the unroll factor and VF is the
573 /// vectorization factor.
574 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575
576 /// Set up the values of the IVs correctly when exiting the vector loop.
577 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578 Value *CountRoundDown, Value *EndValue,
579 BasicBlock *MiddleBlock);
580
581 /// Create a new induction variable inside L.
582 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
583 Value *Step, Instruction *DL);
584
585 /// Handle all cross-iteration phis in the header.
586 void fixCrossIterationPHIs();
587
588 /// Fix a first-order recurrence. This is the second phase of vectorizing
589 /// this phi node.
590 void fixFirstOrderRecurrence(PHINode *Phi);
591
592 /// Fix a reduction cross-iteration phi. This is the second phase of
593 /// vectorizing this phi node.
594 void fixReduction(PHINode *Phi);
595
596 /// Clear NSW/NUW flags from reduction instructions if necessary.
597 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
598
599 /// The Loop exit block may have single value PHI nodes with some
600 /// incoming value. While vectorizing we only handled real values
601 /// that were defined inside the loop and we should have one value for
602 /// each predecessor of its parent basic block. See PR14725.
603 void fixLCSSAPHIs();
604
605 /// Iteratively sink the scalarized operands of a predicated instruction into
606 /// the block that was created for it.
607 void sinkScalarOperands(Instruction *PredInst);
608
609 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
610 /// represented as.
611 void truncateToMinimalBitwidths();
612
613 /// Create a broadcast instruction. This method generates a broadcast
614 /// instruction (shuffle) for loop invariant values and for the induction
615 /// value. If this is the induction variable then we extend it to N, N+1, ...
616 /// this is needed because each iteration in the loop corresponds to a SIMD
617 /// element.
618 virtual Value *getBroadcastInstrs(Value *V);
619
620 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
621 /// to each vector element of Val. The sequence starts at StartIndex.
622 /// \p Opcode is relevant for FP induction variable.
623 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624 Instruction::BinaryOps Opcode =
625 Instruction::BinaryOpsEnd);
626
627 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628 /// variable on which to base the steps, \p Step is the size of the step, and
629 /// \p EntryVal is the value from the original loop that maps to the steps.
630 /// Note that \p EntryVal doesn't have to be an induction variable - it
631 /// can also be a truncate instruction.
632 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633 const InductionDescriptor &ID);
634
635 /// Create a vector induction phi node based on an existing scalar one. \p
636 /// EntryVal is the value from the original loop that maps to the vector phi
637 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
638 /// truncate instruction, instead of widening the original IV, we widen a
639 /// version of the IV truncated to \p EntryVal's type.
640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
641 Value *Step, Instruction *EntryVal);
642
643 /// Returns true if an instruction \p I should be scalarized instead of
644 /// vectorized for the chosen vectorization factor.
645 bool shouldScalarizeInstruction(Instruction *I) const;
646
647 /// Returns true if we should generate a scalar version of \p IV.
648 bool needsScalarInduction(Instruction *IV) const;
649
650 /// If there is a cast involved in the induction variable \p ID, which should
651 /// be ignored in the vectorized loop body, this function records the
652 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
653 /// cast. We had already proved that the casted Phi is equal to the uncasted
654 /// Phi in the vectorized loop (under a runtime guard), and therefore
655 /// there is no need to vectorize the cast - the same value can be used in the
656 /// vector loop for both the Phi and the cast.
657 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
658 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
659 ///
660 /// \p EntryVal is the value from the original loop that maps to the vector
661 /// phi node and is used to distinguish what is the IV currently being
662 /// processed - original one (if \p EntryVal is a phi corresponding to the
663 /// original IV) or the "newly-created" one based on the proof mentioned above
664 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
665 /// latter case \p EntryVal is a TruncInst and we must not record anything for
666 /// that IV, but it's error-prone to expect callers of this routine to care
667 /// about that, hence this explicit parameter.
668 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
669 const Instruction *EntryVal,
670 Value *VectorLoopValue,
671 unsigned Part,
672 unsigned Lane = UINT_MAX);
673
674 /// Generate a shuffle sequence that will reverse the vector Vec.
675 virtual Value *reverseVector(Value *Vec);
676
677 /// Returns (and creates if needed) the original loop trip count.
678 Value *getOrCreateTripCount(Loop *NewLoop);
679
680 /// Returns (and creates if needed) the trip count of the widened loop.
681 Value *getOrCreateVectorTripCount(Loop *NewLoop);
682
683 /// Returns a bitcasted value to the requested vector type.
684 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
685 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
686 const DataLayout &DL);
687
688 /// Emit a bypass check to see if the vector trip count is zero, including if
689 /// it overflows.
690 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
691
692 /// Emit a bypass check to see if all of the SCEV assumptions we've
693 /// had to make are correct.
694 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
695
696 /// Emit bypass checks to check any memory assumptions we may have made.
697 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
698
699 /// Compute the transformed value of Index at offset StartValue using step
700 /// StepValue.
701 /// For integer induction, returns StartValue + Index * StepValue.
702 /// For pointer induction, returns StartValue[Index * StepValue].
703 /// FIXME: The newly created binary instructions should contain nsw/nuw
704 /// flags, which can be found from the original scalar operations.
705 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
706 const DataLayout &DL,
707 const InductionDescriptor &ID) const;
708
709 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
710 /// vector loop preheader, middle block and scalar preheader. Also
711 /// allocate a loop object for the new vector loop and return it.
712 Loop *createVectorLoopSkeleton(StringRef Prefix);
713
714 /// Create new phi nodes for the induction variables to resume iteration count
715 /// in the scalar epilogue, from where the vectorized loop left off (given by
716 /// \p VectorTripCount).
717 void createInductionResumeValues(Loop *L, Value *VectorTripCount);
718
719 /// Complete the loop skeleton by adding debug MDs, creating appropriate
720 /// conditional branches in the middle block, preparing the builder and
721 /// running the verifier. Take in the vector loop \p L as argument, and return
722 /// the preheader of the completed vector loop.
723 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
724
725 /// Add additional metadata to \p To that was not present on \p Orig.
726 ///
727 /// Currently this is used to add the noalias annotations based on the
728 /// inserted memchecks. Use this for instructions that are *cloned* into the
729 /// vector loop.
730 void addNewMetadata(Instruction *To, const Instruction *Orig);
731
732 /// Add metadata from one instruction to another.
733 ///
734 /// This includes both the original MDs from \p From and additional ones (\see
735 /// addNewMetadata). Use this for *newly created* instructions in the vector
736 /// loop.
737 void addMetadata(Instruction *To, Instruction *From);
738
739 /// Similar to the previous function but it adds the metadata to a
740 /// vector of instructions.
741 void addMetadata(ArrayRef<Value *> To, Instruction *From);
742
743 /// The original loop.
744 Loop *OrigLoop;
745
746 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
747 /// dynamic knowledge to simplify SCEV expressions and converts them to a
748 /// more usable form.
749 PredicatedScalarEvolution &PSE;
750
751 /// Loop Info.
752 LoopInfo *LI;
753
754 /// Dominator Tree.
755 DominatorTree *DT;
756
757 /// Alias Analysis.
758 AAResults *AA;
759
760 /// Target Library Info.
761 const TargetLibraryInfo *TLI;
762
763 /// Target Transform Info.
764 const TargetTransformInfo *TTI;
765
766 /// Assumption Cache.
767 AssumptionCache *AC;
768
769 /// Interface to emit optimization remarks.
770 OptimizationRemarkEmitter *ORE;
771
772 /// LoopVersioning. It's only set up (non-null) if memchecks were
773 /// used.
774 ///
775 /// This is currently only used to add no-alias metadata based on the
776/// memchecks. The actual versioning is performed manually.
777 std::unique_ptr<LoopVersioning> LVer;
778
779 /// The vectorization SIMD factor to use. Each vector will have this many
780 /// vector elements.
781 ElementCount VF;
782
783 /// The vectorization unroll factor to use. Each scalar is vectorized to this
784 /// many different vector instructions.
785 unsigned UF;
786
787 /// The builder that we use
788 IRBuilder<> Builder;
789
790 // --- Vectorization state ---
791
792 /// The vector-loop preheader.
793 BasicBlock *LoopVectorPreHeader;
794
795 /// The scalar-loop preheader.
796 BasicBlock *LoopScalarPreHeader;
797
798 /// Middle Block between the vector and the scalar.
799 BasicBlock *LoopMiddleBlock;
800
801 /// The ExitBlock of the scalar loop.
802 BasicBlock *LoopExitBlock;
803
804 /// The vector loop body.
805 BasicBlock *LoopVectorBody;
806
807 /// The scalar loop body.
808 BasicBlock *LoopScalarBody;
809
810 /// A list of all bypass blocks. The first block is the entry of the loop.
811 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
812
813 /// The new Induction variable which was added to the new block.
814 PHINode *Induction = nullptr;
815
816 /// The induction variable of the old basic block.
817 PHINode *OldInduction = nullptr;
818
819 /// Maps values from the original loop to their corresponding values in the
820 /// vectorized loop. A key value can map to either vector values, scalar
821 /// values or both kinds of values, depending on whether the key was
822 /// vectorized and scalarized.
823 VectorizerValueMap VectorLoopValueMap;
824
825 /// Store instructions that were predicated.
826 SmallVector<Instruction *, 4> PredicatedInstructions;
827
828 /// Trip count of the original loop.
829 Value *TripCount = nullptr;
830
831 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
832 Value *VectorTripCount = nullptr;
833
834 /// The legality analysis.
835 LoopVectorizationLegality *Legal;
836
838 /// The profitability analysis.
838 LoopVectorizationCostModel *Cost;
839
840 // Record whether runtime checks are added.
841 bool AddedSafetyChecks = false;
842
843 // Holds the end values for each induction variable. We save the end values
844 // so we can later fix-up the external users of the induction variables.
845 DenseMap<PHINode *, Value *> IVEndValues;
846
847 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
848 // fixed up at the end of vector code generation.
849 SmallVector<PHINode *, 8> OrigPHIsToFix;
850
851 /// BFI and PSI are used to check for profile guided size optimizations.
852 BlockFrequencyInfo *BFI;
853 ProfileSummaryInfo *PSI;
854
855 // Whether this loop should be optimized for size based on profile guided size
856 // optimizations.
857 bool OptForSizeBasedOnProfile;
858};
859
860class InnerLoopUnroller : public InnerLoopVectorizer {
861public:
862 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
863 LoopInfo *LI, DominatorTree *DT,
864 const TargetLibraryInfo *TLI,
865 const TargetTransformInfo *TTI, AssumptionCache *AC,
866 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
867 LoopVectorizationLegality *LVL,
868 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
869 ProfileSummaryInfo *PSI)
870 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
871 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
872 BFI, PSI) {}
873
874private:
875 Value *getBroadcastInstrs(Value *V) override;
876 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
877 Instruction::BinaryOps Opcode =
878 Instruction::BinaryOpsEnd) override;
879 Value *reverseVector(Value *Vec) override;
880};
881
882} // end namespace llvm
883
884/// Look for a meaningful debug location on the instruction or its
885/// operands.
886static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
887 if (!I)
888 return I;
889
890 DebugLoc Empty;
891 if (I->getDebugLoc() != Empty)
892 return I;
893
894 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
895 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
896 if (OpInst->getDebugLoc() != Empty)
897 return OpInst;
898 }
899
900 return I;
901}
902
903void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
904 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
905 const DILocation *DIL = Inst->getDebugLoc();
906 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
907 !isa<DbgInfoIntrinsic>(Inst)) {
908 assert(!VF.isScalable() && "scalable vectors not yet supported.");
909 auto NewDIL =
910 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
911 if (NewDIL)
912 B.SetCurrentDebugLocation(NewDIL.getValue());
913 else
914 LLVM_DEBUG(dbgs()
915 << "Failed to create new discriminator: "
916 << DIL->getFilename() << " Line: " << DIL->getLine());
917 }
918 else
919 B.SetCurrentDebugLocation(DIL);
920 } else
921 B.SetCurrentDebugLocation(DebugLoc());
922}
923
924/// Write a record \p DebugMsg about vectorization failure to the debug
925/// output stream. If \p I is passed, it is an instruction that prevents
926/// vectorization.
927#ifndef NDEBUG
928static void debugVectorizationFailure(const StringRef DebugMsg,
929 Instruction *I) {
930 dbgs() << "LV: Not vectorizing: " << DebugMsg;
931 if (I != nullptr)
932 dbgs() << " " << *I;
933 else
934 dbgs() << '.';
935 dbgs() << '\n';
936}
937#endif
938
939/// Create an analysis remark that explains why vectorization failed
940///
941/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
942/// RemarkName is the identifier for the remark. If \p I is passed it is an
943/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
944/// the location of the remark. \return the remark object that can be
945/// streamed to.
946static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
947 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
948 Value *CodeRegion = TheLoop->getHeader();
949 DebugLoc DL = TheLoop->getStartLoc();
950
951 if (I) {
952 CodeRegion = I->getParent();
953 // If there is no debug location attached to the instruction, revert back to
954 // using the loop's.
955 if (I->getDebugLoc())
956 DL = I->getDebugLoc();
957 }
958
959 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
960 R << "loop not vectorized: ";
961 return R;
962}
963
964namespace llvm {
965
966void reportVectorizationFailure(const StringRef DebugMsg,
967 const StringRef OREMsg, const StringRef ORETag,
968 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
969 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
970 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
971 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
972 ORETag, TheLoop, I) << OREMsg);
973}
974
975} // end namespace llvm
976
977#ifndef NDEBUG
978/// \return string containing a file name and a line # for the given loop.
979static std::string getDebugLocString(const Loop *L) {
980 std::string Result;
981 if (L) {
982 raw_string_ostream OS(Result);
983 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
984 LoopDbgLoc.print(OS);
985 else
986 // Just print the module name.
987 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
988 OS.flush();
989 }
990 return Result;
991}
992#endif
993
994void InnerLoopVectorizer::addNewMetadata(Instruction *To,
995 const Instruction *Orig) {
996 // If the loop was versioned with memchecks, add the corresponding no-alias
997 // metadata.
998 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
999 LVer->annotateInstWithNoAlias(To, Orig);
1000}
1001
1002void InnerLoopVectorizer::addMetadata(Instruction *To,
1003 Instruction *From) {
1004 propagateMetadata(To, From);
1005 addNewMetadata(To, From);
1006}
1007
1008void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1009 Instruction *From) {
1010 for (Value *V : To) {
1011 if (Instruction *I = dyn_cast<Instruction>(V))
1012 addMetadata(I, From);
1013 }
1014}
1015
1016namespace llvm {
1017
1018// Loop vectorization cost-model hints how the scalar epilogue loop should be
1019// lowered.
1020enum ScalarEpilogueLowering {
1021
1022 // The default: allowing scalar epilogues.
1023 CM_ScalarEpilogueAllowed,
1024
1025 // Vectorization with OptForSize: don't allow epilogues.
1026 CM_ScalarEpilogueNotAllowedOptSize,
1027
1028 // A special case of vectorisation with OptForSize: loops with a very small
1029 // trip count are considered for vectorization under OptForSize, thereby
1030 // making sure the cost of their loop body is dominant, free of runtime
1031 // guards and scalar iteration overheads.
1032 CM_ScalarEpilogueNotAllowedLowTripLoop,
1033
1034 // Loop hint predicate indicating an epilogue is undesired.
1035 CM_ScalarEpilogueNotNeededUsePredicate
1036};
1037
1038/// LoopVectorizationCostModel - estimates the expected speedups due to
1039/// vectorization.
1040/// In many cases vectorization is not profitable. This can happen because of
1041/// a number of reasons. In this class we mainly attempt to predict the
1042/// expected speedup/slowdowns due to the supported instruction set. We use the
1043/// TargetTransformInfo to query the different backends for the cost of
1044/// different operations.
1045class LoopVectorizationCostModel {
1046public:
1047 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1048 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1049 LoopVectorizationLegality *Legal,
1050 const TargetTransformInfo &TTI,
1051 const TargetLibraryInfo *TLI, DemandedBits *DB,
1052 AssumptionCache *AC,
1053 OptimizationRemarkEmitter *ORE, const Function *F,
1054 const LoopVectorizeHints *Hints,
1055 InterleavedAccessInfo &IAI)
1056 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1057 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1058 Hints(Hints), InterleaveInfo(IAI) {}
1059
1060 /// \return An upper bound for the vectorization factor, or None if
1061 /// vectorization and interleaving should be avoided up front.
1062 Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1063
1064 /// \return True if runtime checks are required for vectorization, and false
1065 /// otherwise.
1066 bool runtimeChecksRequired();
1067
1068 /// \return The most profitable vectorization factor and the cost of that VF.
1069 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1070 /// then this vectorization factor will be selected if vectorization is
1071 /// possible.
1072 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1073
1074 /// Setup cost-based decisions for user vectorization factor.
1075 void selectUserVectorizationFactor(ElementCount UserVF) {
1076 collectUniformsAndScalars(UserVF);
1077 collectInstsToScalarize(UserVF);
1078 }
1079
1080 /// \return The size (in bits) of the smallest and widest types in the code
1081 /// that needs to be vectorized. We ignore values that remain scalar such as
1082 /// 64 bit loop indices.
1083 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1084
1085 /// \return The desired interleave count.
1086 /// If interleave count has been specified by metadata it will be returned.
1087 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1088 /// are the selected vectorization factor and the cost of the selected VF.
1089 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1090
1091 /// Memory access instruction may be vectorized in more than one way.
1092 /// Form of instruction after vectorization depends on cost.
1093 /// This function takes cost-based decisions for Load/Store instructions
1094 /// and collects them in a map. This decisions map is used for building
1095 /// the lists of loop-uniform and loop-scalar instructions.
1096 /// The calculated cost is saved with widening decision in order to
1097 /// avoid redundant calculations.
1098 void setCostBasedWideningDecision(ElementCount VF);
1099
1100 /// A struct that represents some properties of the register usage
1101 /// of a loop.
1102 struct RegisterUsage {
1103 /// Holds the number of loop invariant values that are used in the loop.
1104 /// The key is ClassID of target-provided register class.
1105 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1106 /// Holds the maximum number of concurrent live intervals in the loop.
1107 /// The key is ClassID of target-provided register class.
1108 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1109 };
1110
1111 /// \return Returns information about the register usages of the loop for the
1112 /// given vectorization factors.
1113 SmallVector<RegisterUsage, 8>
1114 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1115
1116 /// Collect values we want to ignore in the cost model.
1117 void collectValuesToIgnore();
1118
1119 /// Split reductions into those that happen in the loop, and those that happen
1120 /// outside. In loop reductions are collected into InLoopReductionChains.
1121 void collectInLoopReductions();
1122
1123 /// \returns The smallest bitwidth each instruction can be represented with.
1124 /// The vector equivalents of these instructions should be truncated to this
1125 /// type.
1126 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1127 return MinBWs;
1128 }
1129
1130 /// \returns True if it is more profitable to scalarize instruction \p I for
1131 /// vectorization factor \p VF.
1132 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1133 assert(VF.isVector() &&
1134 "Profitable to scalarize relevant only for VF > 1.");
1135
1136 // Cost model is not run in the VPlan-native path - return conservative
1137 // result until this changes.
1138 if (EnableVPlanNativePath)
1139 return false;
1140
1141 auto Scalars = InstsToScalarize.find(VF);
1142 assert(Scalars != InstsToScalarize.end() &&
1143 "VF not yet analyzed for scalarization profitability");
1144 return Scalars->second.find(I) != Scalars->second.end();
1145 }
1146
1147 /// Returns true if \p I is known to be uniform after vectorization.
1148 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1149 if (VF.isScalar())
1150 return true;
1151
1152 // Cost model is not run in the VPlan-native path - return conservative
1153 // result until this changes.
1154 if (EnableVPlanNativePath)
1155 return false;
1156
1157 auto UniformsPerVF = Uniforms.find(VF);
1158 assert(UniformsPerVF != Uniforms.end() &&
1159 "VF not yet analyzed for uniformity");
1160 return UniformsPerVF->second.count(I);
1161 }
1162
1163 /// Returns true if \p I is known to be scalar after vectorization.
1164 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1165 if (VF.isScalar())
1166 return true;
1167
1168 // Cost model is not run in the VPlan-native path - return conservative
1169 // result until this changes.
1170 if (EnableVPlanNativePath)
1171 return false;
1172
1173 auto ScalarsPerVF = Scalars.find(VF);
1174 assert(ScalarsPerVF != Scalars.end() &&
1175 "Scalar values are not calculated for VF");
1176 return ScalarsPerVF->second.count(I);
1177 }
1178
1179 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1180 /// for vectorization factor \p VF.
1181 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1182 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1183 !isProfitableToScalarize(I, VF) &&
1184 !isScalarAfterVectorization(I, VF);
1185 }
1186
1187 /// Decision that was taken during cost calculation for memory instruction.
1188 enum InstWidening {
1189 CM_Unknown,
1190 CM_Widen, // For consecutive accesses with stride +1.
1191 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1192 CM_Interleave,
1193 CM_GatherScatter,
1194 CM_Scalarize
1195 };
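// Illustrative mapping (editorial sketch, not part of the original source) of
// typical access patterns to these decisions, assuming the target supports the
// corresponding vector instructions:
//
//   for (i = 0; i < n; ++i) out[i] = a[i];          // stride +1       -> CM_Widen
//   for (i = 0; i < n; ++i) out[i] = a[n - 1 - i];  // stride -1       -> CM_Widen_Reverse
//   for (i = 0; i < n; ++i) out[i] = a[idx[i]];     // indexed access  -> CM_GatherScatter
//   for (i = 0; i < n; ++i) { x = a[2*i]; y = a[2*i+1]; }
//                                                   // grouped strided -> CM_Interleave
//   anything unsupported or too expensive           //                 -> CM_Scalarize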
1196
1197 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1198 /// instruction \p I and vector width \p VF.
1199 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1200 unsigned Cost) {
1201 assert(VF.isVector() && "Expected VF >=2");
1202 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1203 }
1204
1205 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1206 /// interleaving group \p Grp and vector width \p VF.
1207 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1208 ElementCount VF, InstWidening W, unsigned Cost) {
1209 assert(VF.isVector() && "Expected VF >=2");
1210 /// Broadcast this decision to all instructions inside the group.
1211 /// But the cost will be assigned to one instruction only.
1212 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1213 if (auto *I = Grp->getMember(i)) {
1214 if (Grp->getInsertPos() == I)
1215 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1216 else
1217 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1218 }
1219 }
1220 }
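// Worked example (editorial): for an interleave group {load a[2*i], load a[2*i+1]}
// whose insert position is the first load, the loop above records the same
// InstWidening decision for both members but attaches the full group cost only
// to the insert position; the other member gets cost 0, so the group is not
// counted twice when per-instruction costs are summed.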
1221
1222 /// Return the cost model decision for the given instruction \p I and vector
1223 /// width \p VF. Return CM_Unknown if this instruction did not pass
1224 /// through the cost modeling.
1225 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1226 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1227 assert(VF.isVector() && "Expected VF >=2");
1228
1229 // Cost model is not run in the VPlan-native path - return conservative
1230 // result until this changes.
1231 if (EnableVPlanNativePath)
1232 return CM_GatherScatter;
1233
1234 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1235 auto Itr = WideningDecisions.find(InstOnVF);
1236 if (Itr == WideningDecisions.end())
1237 return CM_Unknown;
1238 return Itr->second.first;
1239 }
1240
1241 /// Return the vectorization cost for the given instruction \p I and vector
1242 /// width \p VF.
1243 unsigned getWideningCost(Instruction *I, ElementCount VF) {
1244 assert(VF.isVector() && "Expected VF >=2");
1245 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1246 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1247 "The cost is not calculated");
1248 return WideningDecisions[InstOnVF].second;
1249 }
1250
1251 /// Return True if instruction \p I is an optimizable truncate whose operand
1252 /// is an induction variable. Such a truncate will be removed by adding a new
1253 /// induction variable with the destination type.
1254 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1255 // If the instruction is not a truncate, return false.
1256 auto *Trunc = dyn_cast<TruncInst>(I);
1257 if (!Trunc)
1258 return false;
1259
1260 // Get the source and destination types of the truncate.
1261 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1262 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1263
1264 // If the truncate is free for the given types, return false. Replacing a
1265 // free truncate with an induction variable would add an induction variable
1266 // update instruction to each iteration of the loop. We exclude from this
1267 // check the primary induction variable since it will need an update
1268 // instruction regardless.
1269 Value *Op = Trunc->getOperand(0);
1270 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1271 return false;
1272
1273 // If the truncated value is not an induction variable, return false.
1274 return Legal->isInductionPhi(Op);
1275 }
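// Illustrative example (editorial sketch):
//
//   for (int64_t i = 0; i < n; ++i)
//     out[i] = (int32_t)i;               // trunc i64 %i to i32
//
// Here the operand of the truncate is an induction phi, so the truncate is
// "optimizable": instead of widening the trunc, the vectorizer can introduce a
// new induction variable of the destination (i32) type and drop the truncate.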
1276
1277 /// Collects the instructions to scalarize for each predicated instruction in
1278 /// the loop.
1279 void collectInstsToScalarize(ElementCount VF);
1280
1281 /// Collect Uniform and Scalar values for the given \p VF.
1282 /// The sets depend on CM decision for Load/Store instructions
1283 /// that may be vectorized as interleave, gather-scatter or scalarized.
1284 void collectUniformsAndScalars(ElementCount VF) {
1285 // Do the analysis once.
1286 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1287 return;
1288 setCostBasedWideningDecision(VF);
1289 collectLoopUniforms(VF);
1290 collectLoopScalars(VF);
1291 }
1292
1293 /// Returns true if the target machine supports masked store operation
1294 /// for the given \p DataType and kind of access to \p Ptr.
1295 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1296 return Legal->isConsecutivePtr(Ptr) &&
1297 TTI.isLegalMaskedStore(DataType, Alignment);
1298 }
1299
1300 /// Returns true if the target machine supports masked load operation
1301 /// for the given \p DataType and kind of access to \p Ptr.
1302 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1303 return Legal->isConsecutivePtr(Ptr) &&
1304 TTI.isLegalMaskedLoad(DataType, Alignment);
1305 }
1306
1307 /// Returns true if the target machine supports masked scatter operation
1308 /// for the given \p DataType.
1309 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1310 return TTI.isLegalMaskedScatter(DataType, Alignment);
1311 }
1312
1313 /// Returns true if the target machine supports masked gather operation
1314 /// for the given \p DataType.
1315 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1316 return TTI.isLegalMaskedGather(DataType, Alignment);
1317 }
1318
1319 /// Returns true if the target machine can represent \p V as a masked gather
1320 /// or scatter operation.
1321 bool isLegalGatherOrScatter(Value *V) {
1322 bool LI = isa<LoadInst>(V);
1323 bool SI = isa<StoreInst>(V);
1324 if (!LI && !SI)
1325 return false;
1326 auto *Ty = getMemInstValueType(V);
1327 Align Align = getLoadStoreAlignment(V);
1328 return (LI && isLegalMaskedGather(Ty, Align)) ||
1329 (SI && isLegalMaskedScatter(Ty, Align));
1330 }
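// Example (editorial sketch): an indexed access such as
//
//   for (i = 0; i < n; ++i) sum += a[idx[i]];
//
// is a masked-gather candidate; the helper above simply forwards the element
// type and alignment of the load or store to the matching TTI legality hook.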
1331
1332 /// Returns true if \p I is an instruction that will be scalarized with
1333 /// predication. Such instructions include conditional stores and
1334 /// instructions that may divide by zero.
1335 /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1336 /// with predication for that VF.
1337 bool isScalarWithPredication(Instruction *I,
1338 ElementCount VF = ElementCount::getFixed(1));
1339
1340 // Returns true if \p I is an instruction that will be predicated either
1341 // through scalar predication or masked load/store or masked gather/scatter.
1342 // Superset of instructions that return true for isScalarWithPredication.
1343 bool isPredicatedInst(Instruction *I) {
1344 if (!blockNeedsPredication(I->getParent()))
1345 return false;
1346 // Loads and stores that need some form of masked operation are predicated
1347 // instructions.
1348 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1349 return Legal->isMaskRequired(I);
1350 return isScalarWithPredication(I);
1351 }
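// Example (editorial sketch): in
//
//   for (i = 0; i < n; ++i)
//     if (cond[i]) out[i] = a[i] / b[i];
//
// the conditional store (and the loads, when a mask is required) are predicated
// through masked memory operations, while an integer division is scalarized with
// predication because it must not execute on lanes where cond[i] is false.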
1352
1353 /// Returns true if \p I is a memory instruction with consecutive memory
1354 /// access that can be widened.
1355 bool
1356 memoryInstructionCanBeWidened(Instruction *I,
1357 ElementCount VF = ElementCount::getFixed(1));
1358
1359 /// Returns true if \p I is a memory instruction in an interleaved-group
1360 /// of memory accesses that can be vectorized with wide vector loads/stores
1361 /// and shuffles.
1362 bool
1363 interleavedAccessCanBeWidened(Instruction *I,
1364 ElementCount VF = ElementCount::getFixed(1));
1365
1366 /// Check if \p Instr belongs to any interleaved access group.
1367 bool isAccessInterleaved(Instruction *Instr) {
1368 return InterleaveInfo.isInterleaved(Instr);
1369 }
1370
1371 /// Get the interleaved access group that \p Instr belongs to.
1372 const InterleaveGroup<Instruction> *
1373 getInterleavedAccessGroup(Instruction *Instr) {
1374 return InterleaveInfo.getInterleaveGroup(Instr);
1375 }
1376
1377 /// Returns true if an interleaved group requires a scalar iteration
1378 /// to handle accesses with gaps, and there is nothing preventing us from
1379 /// creating a scalar epilogue.
1380 bool requiresScalarEpilogue() const {
1381 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1382 }
1383
1384 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1385 /// disallowed due to optsize or a loop hint annotation.
1386 bool isScalarEpilogueAllowed() const {
1387 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1388 }
1389
1390 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1391 bool foldTailByMasking() const { return FoldTailByMasking; }
1392
1393 bool blockNeedsPredication(BasicBlock *BB) {
1394 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1395 }
1396
1397 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1398 /// nodes to the chain of instructions representing the reductions. Uses a
1399 /// MapVector to ensure deterministic iteration order.
1400 using ReductionChainMap =
1401 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1402
1403 /// Return the chain of instructions representing an inloop reduction.
1404 const ReductionChainMap &getInLoopReductionChains() const {
1405 return InLoopReductionChains;
1406 }
1407
1408 /// Returns true if the Phi is part of an inloop reduction.
1409 bool isInLoopReduction(PHINode *Phi) const {
1410 return InLoopReductionChains.count(Phi);
1411 }
1412
1413 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1414 /// with factor VF. Return the cost of the instruction, including
1415 /// scalarization overhead if it's needed.
1416 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1417
1418 /// Estimate cost of a call instruction CI if it were vectorized with factor
1419 /// VF. Return the cost of the instruction, including scalarization overhead
1420 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1421 /// scalarized -
1422 /// i.e. either a vector version isn't available or it is too expensive.
1423 unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1424 bool &NeedToScalarize);
1425
1426 /// Invalidates decisions already taken by the cost model.
1427 void invalidateCostModelingDecisions() {
1428 WideningDecisions.clear();
1429 Uniforms.clear();
1430 Scalars.clear();
1431 }
1432
1433private:
1434 unsigned NumPredStores = 0;
1435
1436 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1437 /// than zero. One is returned if vectorization should best be avoided due
1438 /// to cost.
1439 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1440
1441 /// The vectorization cost is a combination of the cost itself and a boolean
1442 /// indicating whether any of the contributing operations will actually
1443 /// operate on
1444 /// vector values after type legalization in the backend. If this latter value
1445 /// is
1446 /// false, then all operations will be scalarized (i.e. no vectorization has
1447 /// actually taken place).
1448 using VectorizationCostTy = std::pair<unsigned, bool>;
1449
1450 /// Returns the expected execution cost. The unit of the cost does
1451 /// not matter because we use the 'cost' units to compare different
1452 /// vector widths. The cost that is returned is *not* normalized by
1453 /// the factor width.
1454 VectorizationCostTy expectedCost(ElementCount VF);
1455
1456 /// Returns the execution time cost of an instruction for a given vector
1457 /// width. Vector width of one means scalar.
1458 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1459
1460 /// The cost-computation logic from getInstructionCost which provides
1461 /// the vector type as an output parameter.
1462 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1463
1464 /// Calculate vectorization cost of memory instruction \p I.
1465 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1466
1467 /// The cost computation for scalarized memory instruction.
1468 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1469
1470 /// The cost computation for interleaving group of memory instructions.
1471 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1472
1473 /// The cost computation for Gather/Scatter instruction.
1474 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1475
1476 /// The cost computation for widening instruction \p I with consecutive
1477 /// memory access.
1478 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1479
1480 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1481 /// Load: scalar load + broadcast.
1482 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1483 /// element)
1484 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1485
1486 /// Estimate the overhead of scalarizing an instruction. This is a
1487 /// convenience wrapper for the type-based getScalarizationOverhead API.
1488 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1489
1490 /// Returns whether the instruction is a load or store and will be emitted
1491 /// as a vector operation.
1492 bool isConsecutiveLoadOrStore(Instruction *I);
1493
1494 /// Returns true if an artificially high cost for emulated masked memrefs
1495 /// should be used.
1496 bool useEmulatedMaskMemRefHack(Instruction *I);
1497
1498 /// Map of scalar integer values to the smallest bitwidth they can be legally
1499 /// represented as. The vector equivalents of these values should be truncated
1500 /// to this type.
1501 MapVector<Instruction *, uint64_t> MinBWs;
1502
1503 /// A type representing the costs for instructions if they were to be
1504 /// scalarized rather than vectorized. The entries are Instruction-Cost
1505 /// pairs.
1506 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1507
1508 /// A set containing all BasicBlocks that are known to be present after
1509 /// vectorization as predicated blocks.
1510 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1511
1512 /// Records whether it is allowed to have the original scalar loop execute at
1513 /// least once. This may be needed as a fallback loop in case runtime
1514 /// aliasing/dependence checks fail, or to handle the tail/remainder
1515 /// iterations when the trip count is unknown or doesn't divide by the VF,
1516 /// or as a peel-loop to handle gaps in interleave-groups.
1517 /// Under optsize and when the trip count is very small we don't allow any
1518 /// iterations to execute in the scalar loop.
1519 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1520
1521 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1522 bool FoldTailByMasking = false;
1523
1524 /// A map holding scalar costs for different vectorization factors. The
1525 /// presence of a cost for an instruction in the mapping indicates that the
1526 /// instruction will be scalarized when vectorizing with the associated
1527 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1528 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1529
1530 /// Holds the instructions known to be uniform after vectorization.
1531 /// The data is collected per VF.
1532 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1533
1534 /// Holds the instructions known to be scalar after vectorization.
1535 /// The data is collected per VF.
1536 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1537
1538 /// Holds the instructions (address computations) that are forced to be
1539 /// scalarized.
1540 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1541
1542 /// PHINodes of the reductions that should be expanded in-loop along with
1543 /// their associated chains of reduction operations, in program order from top
1544 /// (PHI) to bottom.
1545 ReductionChainMap InLoopReductionChains;
1546
1547 /// Returns the expected difference in cost from scalarizing the expression
1548 /// feeding a predicated instruction \p PredInst. The instructions to
1549 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1550 /// non-negative return value implies the expression will be scalarized.
1551 /// Currently, only single-use chains are considered for scalarization.
1552 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1553 ElementCount VF);
1554
1555 /// Collect the instructions that are uniform after vectorization. An
1556 /// instruction is uniform if we represent it with a single scalar value in
1557 /// the vectorized loop corresponding to each vector iteration. Examples of
1558 /// uniform instructions include pointer operands of consecutive or
1559 /// interleaved memory accesses. Note that although uniformity implies an
1560 /// instruction will be scalar, the reverse is not true. In general, a
1561 /// scalarized instruction will be represented by VF scalar values in the
1562 /// vectorized loop, each corresponding to an iteration of the original
1563 /// scalar loop.
1564 void collectLoopUniforms(ElementCount VF);
1565
1566 /// Collect the instructions that are scalar after vectorization. An
1567 /// instruction is scalar if it is known to be uniform or will be scalarized
1568 /// during vectorization. Non-uniform scalarized instructions will be
1569 /// represented by VF values in the vectorized loop, each corresponding to an
1570 /// iteration of the original scalar loop.
1571 void collectLoopScalars(ElementCount VF);
1572
1573 /// Keeps cost model vectorization decision and cost for instructions.
1574 /// Right now it is used for memory instructions only.
1575 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1576 std::pair<InstWidening, unsigned>>;
1577
1578 DecisionList WideningDecisions;
1579
1580 /// Returns true if \p V is expected to be vectorized and it needs to be
1581 /// extracted.
1582 bool needsExtract(Value *V, ElementCount VF) const {
1583 Instruction *I = dyn_cast<Instruction>(V);
1584 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1585 TheLoop->isLoopInvariant(I))
1586 return false;
1587
1588 // Assume we can vectorize V (and hence we need extraction) if the
1589 // scalars are not computed yet. This can happen, because it is called
1590 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1591 // the scalars are collected. That should be a safe assumption in most
1592 // cases, because we check if the operands have vectorizable types
1593 // beforehand in LoopVectorizationLegality.
1594 return Scalars.find(VF) == Scalars.end() ||
1595 !isScalarAfterVectorization(I, VF);
1596 };
1597
1598 /// Returns a range containing only operands needing to be extracted.
1599 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1600 ElementCount VF) {
1601 return SmallVector<Value *, 4>(make_filter_range(
1602 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1603 }
1604
1605public:
1606 /// The loop that we evaluate.
1607 Loop *TheLoop;
1608
1609 /// Predicated scalar evolution analysis.
1610 PredicatedScalarEvolution &PSE;
1611
1612 /// Loop Info analysis.
1613 LoopInfo *LI;
1614
1615 /// Vectorization legality.
1616 LoopVectorizationLegality *Legal;
1617
1618 /// Vector target information.
1619 const TargetTransformInfo &TTI;
1620
1621 /// Target Library Info.
1622 const TargetLibraryInfo *TLI;
1623
1624 /// Demanded bits analysis.
1625 DemandedBits *DB;
1626
1627 /// Assumption cache.
1628 AssumptionCache *AC;
1629
1630 /// Interface to emit optimization remarks.
1631 OptimizationRemarkEmitter *ORE;
1632
1633 const Function *TheFunction;
1634
1635 /// Loop Vectorize Hint.
1636 const LoopVectorizeHints *Hints;
1637
1638 /// The interleave access information contains groups of interleaved accesses
1639 /// with the same stride and close to each other.
1640 InterleavedAccessInfo &InterleaveInfo;
1641
1642 /// Values to ignore in the cost model.
1643 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1644
1645 /// Values to ignore in the cost model when VF > 1.
1646 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1647};
1648
1649} // end namespace llvm
1650
1651// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1652// vectorization. The loop needs to be annotated with #pragma omp simd
1653// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1654// vector length information is not provided, vectorization is not considered
1655// explicit. Interleave hints are not allowed either. These limitations will be
1656// relaxed in the future.
1657// Please note that we are currently forced to abuse the pragma 'clang
1658// vectorize' semantics. This pragma provides *auto-vectorization hints*
1659// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1660// provides *explicit vectorization hints* (LV can bypass legal checks and
1661// assume that vectorization is legal). However, both hints are implemented
1662// using the same metadata (llvm.loop.vectorize, processed by
1663// LoopVectorizeHints). This will be fixed in the future when the native IR
1664// representation for pragma 'omp simd' is introduced.
1665static bool isExplicitVecOuterLoop(Loop *OuterLp,
1666 OptimizationRemarkEmitter *ORE) {
1667 assert(!OuterLp->isInnermost() && "This is not an outer loop");
1668 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1669
1670 // Only outer loops with an explicit vectorization hint are supported.
1671 // Unannotated outer loops are ignored.
1672 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1673 return false;
1674
1675 Function *Fn = OuterLp->getHeader()->getParent();
1676 if (!Hints.allowVectorization(Fn, OuterLp,
1677 true /*VectorizeOnlyWhenForced*/)) {
1678 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1679 return false;
1680 }
1681
1682 if (Hints.getInterleave() > 1) {
1683 // TODO: Interleave support is future work.
1684 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1685 "outer loops.\n");
1686 Hints.emitRemarkWithHints();
1687 return false;
1688 }
1689
1690 return true;
1691}
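// Illustrative example (editorial, not from the source): an outer loop that this
// function accepts because it carries an explicit vector-length hint:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)      // explicitly annotated outer loop
//     for (int j = 0; j < m; ++j)
//       a[i][j] += b[i][j];
//
// A '#pragma omp simd simdlen(4)' annotation works as well; an unannotated outer
// loop, or one that also requests interleaving, is rejected.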
1692
1693static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1694 OptimizationRemarkEmitter *ORE,
1695 SmallVectorImpl<Loop *> &V) {
1696 // Collect inner loops and outer loops without irreducible control flow. For
1697 // now, only collect outer loops that have explicit vectorization hints. If we
1698 // are stress testing the VPlan H-CFG construction, we collect the outermost
1699 // loop of every loop nest.
1700 if (L.isInnermost() || VPlanBuildStressTest ||
1701 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1702 LoopBlocksRPO RPOT(&L);
1703 RPOT.perform(LI);
1704 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1705 V.push_back(&L);
1706 // TODO: Collect inner loops inside marked outer loops in case
1707 // vectorization fails for the outer loop. Do not invoke
1708 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1709 // already known to be reducible. We can use an inherited attribute for
1710 // that.
1711 return;
1712 }
1713 }
1714 for (Loop *InnerL : L)
1715 collectSupportedLoops(*InnerL, LI, ORE, V);
1716}
1717
1718namespace {
1719
1720/// The LoopVectorize Pass.
1721struct LoopVectorize : public FunctionPass {
1722 /// Pass identification, replacement for typeid
1723 static char ID;
1724
1725 LoopVectorizePass Impl;
1726
1727 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1728 bool VectorizeOnlyWhenForced = false)
1729 : FunctionPass(ID),
1730 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1731 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1732 }
1733
1734 bool runOnFunction(Function &F) override {
1735 if (skipFunction(F))
1736 return false;
1737
1738 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1739 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1740 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1741 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1742 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1743 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1744 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1745 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1746 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1747 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1748 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1749 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1750 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1751
1752 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1753 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1754
1755 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1756 GetLAA, *ORE, PSI).MadeAnyChange;
1757 }
1758
1759 void getAnalysisUsage(AnalysisUsage &AU) const override {
1760 AU.addRequired<AssumptionCacheTracker>();
1761 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1762 AU.addRequired<DominatorTreeWrapperPass>();
1763 AU.addRequired<LoopInfoWrapperPass>();
1764 AU.addRequired<ScalarEvolutionWrapperPass>();
1765 AU.addRequired<TargetTransformInfoWrapperPass>();
1766 AU.addRequired<AAResultsWrapperPass>();
1767 AU.addRequired<LoopAccessLegacyAnalysis>();
1768 AU.addRequired<DemandedBitsWrapperPass>();
1769 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1770 AU.addRequired<InjectTLIMappingsLegacy>();
1771
1772 // We currently do not preserve loopinfo/dominator analyses with outer loop
1773 // vectorization. Until this is addressed, mark these analyses as preserved
1774 // only for non-VPlan-native path.
1775 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1776 if (!EnableVPlanNativePath) {
1777 AU.addPreserved<LoopInfoWrapperPass>();
1778 AU.addPreserved<DominatorTreeWrapperPass>();
1779 }
1780
1781 AU.addPreserved<BasicAAWrapperPass>();
1782 AU.addPreserved<GlobalsAAWrapperPass>();
1783 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1784 }
1785};
1786
1787} // end anonymous namespace
1788
1789//===----------------------------------------------------------------------===//
1790// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1791// LoopVectorizationCostModel and LoopVectorizationPlanner.
1792//===----------------------------------------------------------------------===//
1793
1794Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1795 // We need to place the broadcast of invariant variables outside the loop,
1796 // but only if it's proven safe to do so. Else, broadcast will be inside
1797 // vector loop body.
1798 Instruction *Instr = dyn_cast<Instruction>(V);
1799 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1800 (!Instr ||
1801 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1802 // Place the code for broadcasting invariant variables in the new preheader.
1803 IRBuilder<>::InsertPointGuard Guard(Builder);
1804 if (SafeToHoist)
1805 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1806
1807 // Broadcast the scalar into all locations in the vector.
1808 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1809
1810 return Shuf;
1811}
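// Example (editorial sketch, VF = 4, i32 scalar): CreateVectorSplat expands the
// broadcast into IR along the lines of
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
//
// emitted in the vector preheader when hoisting is provably safe, and inside the
// vector body otherwise.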
1812
1813void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1814 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1815 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1816 "Expected either an induction phi-node or a truncate of it!");
1817 Value *Start = II.getStartValue();
1818
1819 // Construct the initial value of the vector IV in the vector loop preheader
1820 auto CurrIP = Builder.saveIP();
1821 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1822 if (isa<TruncInst>(EntryVal)) {
1823 assert(Start->getType()->isIntegerTy() &&
1824 "Truncation requires an integer type");
1825 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1826 Step = Builder.CreateTrunc(Step, TruncType);
1827 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1828 }
1829 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1830 Value *SteppedStart =
1831 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1832
1833 // We create vector phi nodes for both integer and floating-point induction
1834 // variables. Here, we determine the kind of arithmetic we will perform.
1835 Instruction::BinaryOps AddOp;
1836 Instruction::BinaryOps MulOp;
1837 if (Step->getType()->isIntegerTy()) {
1838 AddOp = Instruction::Add;
1839 MulOp = Instruction::Mul;
1840 } else {
1841 AddOp = II.getInductionOpcode();
1842 MulOp = Instruction::FMul;
1843 }
1844
1845 // Multiply the vectorization factor by the step using integer or
1846 // floating-point arithmetic as appropriate.
1847 Value *ConstVF =
1848 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1849 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1850
1851 // Create a vector splat to use in the induction update.
1852 //
1853 // FIXME: If the step is non-constant, we create the vector splat with
1854 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1855 // handle a constant vector splat.
1856 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1857 Value *SplatVF = isa<Constant>(Mul)
1858 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1859 : Builder.CreateVectorSplat(VF, Mul);
1860 Builder.restoreIP(CurrIP);
1861
1862 // We may need to add the step a number of times, depending on the unroll
1863 // factor. The last of those goes into the PHI.
1864 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1865 &*LoopVectorBody->getFirstInsertionPt());
1866 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1867 Instruction *LastInduction = VecInd;
1868 for (unsigned Part = 0; Part < UF; ++Part) {
1869 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1870
1871 if (isa<TruncInst>(EntryVal))
1872 addMetadata(LastInduction, EntryVal);
1873 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1874
1875 LastInduction = cast<Instruction>(addFastMathFlag(
1876 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1877 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1878 }
1879
1880 // Move the last step to the end of the latch block. This ensures consistent
1881 // placement of all induction updates.
1882 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1883 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1884 auto *ICmp = cast<Instruction>(Br->getCondition());
1885 LastInduction->moveBefore(ICmp);
1886 LastInduction->setName("vec.ind.next");
1887
1888 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1889 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1890}
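// Worked example (editorial, VF = 4, UF = 2, integer IV with step 1):
//
//   vec.ind      = phi <4 x i64> [ <0,1,2,3>, %vector.ph ], [ %vec.ind.next, %latch ]
//   step.add     = add <4 x i64> %vec.ind,  <4,4,4,4>   ; value used by part 1
//   vec.ind.next = add <4 x i64> %step.add, <4,4,4,4>   ; moved before the latch
//                                                       ; compare, feeds the phi
//
// Each unrolled part advances by a splat of VF * Step; only the final addition is
// wired back into the phi.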
1891
1892bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1893 return Cost->isScalarAfterVectorization(I, VF) ||
1894 Cost->isProfitableToScalarize(I, VF);
1895}
1896
1897bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1898 if (shouldScalarizeInstruction(IV))
1899 return true;
1900 auto isScalarInst = [&](User *U) -> bool {
1901 auto *I = cast<Instruction>(U);
1902 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1903 };
1904 return llvm::any_of(IV->users(), isScalarInst);
1905}
1906
1907void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1908 const InductionDescriptor &ID, const Instruction *EntryVal,
1909 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1910 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1911 "Expected either an induction phi-node or a truncate of it!");
1912
1913 // This induction variable is not the phi from the original loop but the
1914 // newly-created IV based on the proof that casted Phi is equal to the
1915 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1916 // re-uses the same InductionDescriptor that the original IV uses, but we don't
1917 // have to do any recording in this case - that is done when the original IV is
1918 // processed.
1919 if (isa<TruncInst>(EntryVal))
1920 return;
1921
1922 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1923 if (Casts.empty())
1924 return;
1925 // Only the first Cast instruction in the Casts vector is of interest.
1926 // The rest of the Casts (if they exist) have no uses outside the
1927 // induction update chain itself.
1928 Instruction *CastInst = *Casts.begin();
1929 if (Lane < UINT_MAX)
1930 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1931 else
1932 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1933}
1934
1935void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1936 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1937 "Primary induction variable must have an integer type");
1938
1939 auto II = Legal->getInductionVars().find(IV);
1940 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1941
1942 auto ID = II->second;
1943 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1944
1945 // The value from the original loop to which we are mapping the new induction
1946 // variable.
1947 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1948
1949 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1950
1951 // Generate code for the induction step. Note that induction steps are
1952 // required to be loop-invariant
1953 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1954 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1955 "Induction step should be loop invariant");
1956 if (PSE.getSE()->isSCEVable(IV->getType())) {
1957 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1958 return Exp.expandCodeFor(Step, Step->getType(),
1959 LoopVectorPreHeader->getTerminator());
1960 }
1961 return cast<SCEVUnknown>(Step)->getValue();
1962 };
1963
1964 // The scalar value to broadcast. This is derived from the canonical
1965 // induction variable. If a truncation type is given, truncate the canonical
1966 // induction variable and step. Otherwise, derive these values from the
1967 // induction descriptor.
1968 auto CreateScalarIV = [&](Value *&Step) -> Value * {
1969 Value *ScalarIV = Induction;
1970 if (IV != OldInduction) {
1971 ScalarIV = IV->getType()->isIntegerTy()
1972 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1973 : Builder.CreateCast(Instruction::SIToFP, Induction,
1974 IV->getType());
1975 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1976 ScalarIV->setName("offset.idx");
1977 }
1978 if (Trunc) {
1979 auto *TruncType = cast<IntegerType>(Trunc->getType());
1980 assert(Step->getType()->isIntegerTy() &&
1981 "Truncation requires an integer step");
1982 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1983 Step = Builder.CreateTrunc(Step, TruncType);
1984 }
1985 return ScalarIV;
1986 };
1987
1988 // Create the vector values from the scalar IV, in the absence of creating a
1989 // vector IV.
1990 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1991 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1992 for (unsigned Part = 0; Part < UF; ++Part) {
1993 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1994 Value *EntryPart =
1995 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1996 ID.getInductionOpcode());
1997 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1998 if (Trunc)
1999 addMetadata(EntryPart, Trunc);
2000 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2001 }
2002 };
2003
2004 // Now do the actual transformations, and start with creating the step value.
2005 Value *Step = CreateStepValue(ID.getStep());
2006 if (VF.isZero() || VF.isScalar()) {
2007 Value *ScalarIV = CreateScalarIV(Step);
2008 CreateSplatIV(ScalarIV, Step);
2009 return;
2010 }
2011
2012 // Determine if we want a scalar version of the induction variable. This is
2013 // true if the induction variable itself is not widened, or if it has at
2014 // least one user in the loop that is not widened.
2015 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2016 if (!NeedsScalarIV) {
2017 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2018 return;
2019 }
2020
2021 // Try to create a new independent vector induction variable. If we can't
2022 // create the phi node, we will splat the scalar induction variable in each
2023 // loop iteration.
2024 if (!shouldScalarizeInstruction(EntryVal)) {
2025 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2026 Value *ScalarIV = CreateScalarIV(Step);
2027 // Create scalar steps that can be used by instructions we will later
2028 // scalarize. Note that the addition of the scalar steps will not increase
2029 // the number of instructions in the loop in the common case prior to
2030 // InstCombine. We will be trading one vector extract for each scalar step.
2031 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2032 return;
2033 }
2034
2035 // All IV users are scalar instructions, so only emit a scalar IV, not a
2036 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2037 // predicate used by the masked loads/stores.
2038 Value *ScalarIV = CreateScalarIV(Step);
2039 if (!Cost->isScalarEpilogueAllowed())
2040 CreateSplatIV(ScalarIV, Step);
2041 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2042}
2043
2044Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2045 Instruction::BinaryOps BinOp) {
2046 // Create and check the types.
2047 auto *ValVTy = cast<FixedVectorType>(Val->getType());
2048 int VLen = ValVTy->getNumElements();
2049
2050 Type *STy = Val->getType()->getScalarType();
2051 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2052 "Induction Step must be an integer or FP");
2053 assert(Step->getType() == STy && "Step has wrong type");
2054
2055 SmallVector<Constant *, 8> Indices;
2056
2057 if (STy->isIntegerTy()) {
2058 // Create a vector of consecutive numbers from zero to VF.
2059 for (int i = 0; i < VLen; ++i)
2060 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2061
2062 // Add the consecutive indices to the vector value.
2063 Constant *Cv = ConstantVector::get(Indices);
2064 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2065 Step = Builder.CreateVectorSplat(VLen, Step);
2066 assert(Step->getType() == Val->getType() && "Invalid step vec");
2067 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2068 // which can be found from the original scalar operations.
2069 Step = Builder.CreateMul(Cv, Step);
2070 return Builder.CreateAdd(Val, Step, "induction");
2071 }
2072
2073 // Floating point induction.
2074 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2075 "Binary Opcode should be specified for FP induction");
2076 // Create a vector of consecutive numbers from zero to VF.
2077 for (int i = 0; i < VLen; ++i)
2078 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2079
2080 // Add the consecutive indices to the vector value.
2081 Constant *Cv = ConstantVector::get(Indices);
2082
2083 Step = Builder.CreateVectorSplat(VLen, Step);
2084
2085 // Floating point operations had to be 'fast' to enable the induction.
2086 FastMathFlags Flags;
2087 Flags.setFast();
2088
2089 Value *MulOp = Builder.CreateFMul(Cv, Step);
2090 if (isa<Instruction>(MulOp))
2091 // Have to check, MulOp may be a constant
2092 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2093
2094 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2095 if (isa<Instruction>(BOp))
2096 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2097 return BOp;
2098}
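// Worked example (editorial, integer case, VLen = 4, StartIdx = 4, Step = 2):
// lane i of the result is Val[i] + (StartIdx + i) * Step, so the added offsets
// are <8, 10, 12, 14>. The floating-point case is analogous but uses the
// recorded FAdd/FSub opcode and fast-math flags instead of integer add/mul.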
2099
2100void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2101 Instruction *EntryVal,
2102 const InductionDescriptor &ID) {
2103 // We shouldn't have to build scalar steps if we aren't vectorizing.
2104 assert(VF.isVector() && "VF should be greater than one");
2105 assert(!VF.isScalable() &&
2106 "the code below assumes a fixed number of elements at compile time");
2107 // Get the value type and ensure it and the step have the same integer type.
2108 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2109 assert(ScalarIVTy == Step->getType() &&
2110 "Val and Step should have the same type");
2111
2112 // We build scalar steps for both integer and floating-point induction
2113 // variables. Here, we determine the kind of arithmetic we will perform.
2114 Instruction::BinaryOps AddOp;
2115 Instruction::BinaryOps MulOp;
2116 if (ScalarIVTy->isIntegerTy()) {
2117 AddOp = Instruction::Add;
2118 MulOp = Instruction::Mul;
2119 } else {
2120 AddOp = ID.getInductionOpcode();
2121 MulOp = Instruction::FMul;
2122 }
2123
2124 // Determine the number of scalars we need to generate for each unroll
2125 // iteration. If EntryVal is uniform, we only need to generate the first
2126 // lane. Otherwise, we generate all VF values.
2127 unsigned Lanes =
2128 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2129 ? 1
2130 : VF.getKnownMinValue();
2131 // Compute the scalar steps and save the results in VectorLoopValueMap.
2132 for (unsigned Part = 0; Part < UF; ++Part) {
2133 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2134 auto *StartIdx = getSignedIntOrFpConstant(
2135 ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2136 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2137 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2138 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2139 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2140 }
2141 }
2142}
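// A minimal sketch (plain C++, illustrative only) of the scalar step assigned to
// each (Part, Lane) pair above: ScalarIV + (VF * Part + Lane) * Step, with a
// single lane per part when the value is uniform after vectorization.
static double scalarStepSketch(double ScalarIV, double Step, unsigned VF,
                               unsigned Part, unsigned Lane) {
  unsigned StartIdx = VF * Part + Lane;  // getSignedIntOrFpConstant(ScalarIVTy, ...)
  return ScalarIV + StartIdx * Step;     // AddOp(ScalarIV, MulOp(StartIdx, Step))
}

// E.g. with VF = 4, UF = 2 and Step = 1, part 0 produces ScalarIV + {0,1,2,3}
// and part 1 produces ScalarIV + {4,5,6,7}.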
2143
2144Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2145 assert(V != Induction && "The new induction variable should not be used.");
2146 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2147 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2148
2149 // If we have a stride that is replaced by one, do it here. Defer this for
2150 // the VPlan-native path until we start running Legal checks in that path.
2151 if (!EnableVPlanNativePath && Legal->hasStride(V))
2152 V = ConstantInt::get(V->getType(), 1);
2153
2154 // If we have a vector mapped to this value, return it.
2155 if (VectorLoopValueMap.hasVectorValue(V, Part))
2156 return VectorLoopValueMap.getVectorValue(V, Part);
2157
2158 // If the value has not been vectorized, check if it has been scalarized
2159 // instead. If it has been scalarized, and we actually need the value in
2160 // vector form, we will construct the vector values on demand.
2161 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2162 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2163
2164 // If we've scalarized a value, that value should be an instruction.
2165 auto *I = cast<Instruction>(V);
2166
2167 // If we aren't vectorizing, we can just copy the scalar map values over to
2168 // the vector map.
2169 if (VF == 1) {
2170 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2171 return ScalarValue;
2172 }
2173
2174 // Get the last scalar instruction we generated for V and Part. If the value
2175 // is known to be uniform after vectorization, this corresponds to lane zero
2176 // of the Part unroll iteration. Otherwise, the last instruction is the one
2177 // we created for the last vector lane of the Part unroll iteration.
2178 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2179 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2180 ? 0
2181 : VF.getKnownMinValue() - 1;
2182 auto *LastInst = cast<Instruction>(
2183 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2184
2185 // Set the insert point after the last scalarized instruction. This ensures
2186 // the insertelement sequence will directly follow the scalar definitions.
2187 auto OldIP = Builder.saveIP();
2188 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2189 Builder.SetInsertPoint(&*NewIP);
2190
2191 // However, if we are vectorizing, we need to construct the vector values.
2192 // If the value is known to be uniform after vectorization, we can just
2193 // broadcast the scalar value corresponding to lane zero for each unroll
2194 // iteration. Otherwise, we construct the vector values using insertelement
2195 // instructions. Since the resulting vectors are stored in
2196 // VectorLoopValueMap, we will only generate the insertelements once.
2197 Value *VectorValue = nullptr;
2198 if (Cost->isUniformAfterVectorization(I, VF)) {
2199 VectorValue = getBroadcastInstrs(ScalarValue);
2200 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2201 } else {
2202 // Initialize packing with insertelements to start from undef.
2203 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2204 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2205 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2206 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2207 packScalarIntoVectorValue(V, {Part, Lane});
2208 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2209 }
2210 Builder.restoreIP(OldIP);
2211 return VectorValue;
2212 }
2213
2214 // If this scalar is unknown, assume that it is a constant or that it is
2215 // loop invariant. Broadcast V and save the value for future uses.
2216 Value *B = getBroadcastInstrs(V);
2217 VectorLoopValueMap.setVectorValue(V, Part, B);
2218 return B;
2219}
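// A minimal sketch (plain C++, illustrative only) of how a scalarized value is
// turned into a vector above: broadcast lane zero when the value is uniform after
// vectorization, otherwise pack every lane, mirroring the insertelement sequence
// emitted via packScalarIntoVectorValue.
#include <cstddef>
#include <vector>

static std::vector<int> vectorizeScalarLanesSketch(const std::vector<int> &ScalarLanes,
                                                   bool UniformAfterVectorization) {
  if (UniformAfterVectorization)                   // getBroadcastInstrs(ScalarValue)
    return std::vector<int>(ScalarLanes.size(), ScalarLanes[0]);
  std::vector<int> Vec(ScalarLanes.size());
  for (std::size_t Lane = 0; Lane < ScalarLanes.size(); ++Lane)
    Vec[Lane] = ScalarLanes[Lane];                 // one insertelement per lane
  return Vec;
}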
2220
2221Value *
2222InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2223 const VPIteration &Instance) {
2224 // If the value is not an instruction contained in the loop, it should
2225 // already be scalar.
2226 if (OrigLoop->isLoopInvariant(V))
2227 return V;
2228
2229 assert(Instance.Lane > 0
2230 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2231 : true && "Uniform values only have lane zero");
2232
2233 // If the value from the original loop has not been vectorized, it is
2234 // represented by UF x VF scalar values in the new loop. Return the requested
2235 // scalar value.
2236 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2237 return VectorLoopValueMap.getScalarValue(V, Instance);
2238
2239 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2240 // for the given unroll part. If this entry is not a vector type (i.e., the
2241 // vectorization factor is one), there is no need to generate an
2242 // extractelement instruction.
2243 auto *U = getOrCreateVectorValue(V, Instance.Part);
2244 if (!U->getType()->isVectorTy()) {
2245 assert(VF == 1 && "Value not scalarized has non-vector type");
2246 return U;
2247 }
2248
2249 // Otherwise, the value from the original loop has been vectorized and is
2250 // represented by UF vector values. Extract and return the requested scalar
2251 // value from the appropriate vector lane.
2252 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2253}
2254
2255void InnerLoopVectorizer::packScalarIntoVectorValue(
2256 Value *V, const VPIteration &Instance) {
2257 assert(V != Induction && "The new induction variable should not be used.");
2258 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2259 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2260
2261 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2262 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2263 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2264 Builder.getInt32(Instance.Lane));
2265 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2266}
2267
2268Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2269 assert(Vec->getType()->isVectorTy() && "Invalid type");
2270 assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2271 SmallVector<int, 8> ShuffleMask;
2272 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2273 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2274
2275 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2276}
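// A minimal sketch (plain C++, illustrative only) of the shuffle mask built above:
// element i of the reversed vector reads element VF - 1 - i of the input,
// e.g. <3, 2, 1, 0> for VF = 4.
#include <vector>

static std::vector<int> reverseMaskSketch(unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back((int)(VF - i - 1));
  return Mask;
}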
2277
2278// Return whether we allow using masked interleave-groups (for dealing with
2279// strided loads/stores that reside in predicated blocks, or for dealing
2280// with gaps).
2281static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2282 // If an override option has been passed in for interleaved accesses, use it.
2283 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2284 return EnableMaskedInterleavedMemAccesses;
2285
2286 return TTI.enableMaskedInterleavedAccessVectorization();
2287}
2288
2289// Try to vectorize the interleave group that \p Instr belongs to.
2290//
2291// E.g. Translate following interleaved load group (factor = 3):
2292// for (i = 0; i < N; i+=3) {
2293// R = Pic[i]; // Member of index 0
2294// G = Pic[i+1]; // Member of index 1
2295// B = Pic[i+2]; // Member of index 2
2296// ... // do something to R, G, B
2297// }
2298// To:
2299// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2300// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2301// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2302// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2303//
2304// Or translate following interleaved store group (factor = 3):
2305// for (i = 0; i < N; i+=3) {
2306// ... do something to R, G, B
2307// Pic[i] = R; // Member of index 0
2308// Pic[i+1] = G; // Member of index 1
2309// Pic[i+2] = B; // Member of index 2
2310// }
2311// To:
2312// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2313// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2314// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2315// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2316// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2317void InnerLoopVectorizer::vectorizeInterleaveGroup(
2318 const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2319 VPValue *Addr, VPValue *BlockInMask) {
2320 Instruction *Instr = Group->getInsertPos();
2321 const DataLayout &DL = Instr->getModule()->getDataLayout();
2322
2323 // Prepare for the vector type of the interleaved load/store.
2324 Type *ScalarTy = getMemInstValueType(Instr);
2325 unsigned InterleaveFactor = Group->getFactor();
2326 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2327 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2328
2329 // Prepare for the new pointers.
2330 SmallVector<Value *, 2> AddrParts;
2331 unsigned Index = Group->getIndex(Instr);
2332
2333 // TODO: extend the masked interleaved-group support to reversed access.
2334 assert((!BlockInMask || !Group->isReverse()) &&
2335 "Reversed masked interleave-group not supported.");
2336
2337 // If the group is reverse, adjust the index to refer to the last vector lane
2338 // instead of the first. We adjust the index from the first vector lane,
2339 // rather than directly getting the pointer for lane VF - 1, because the
2340 // pointer operand of the interleaved access is supposed to be uniform. For
2341 // uniform instructions, we're only required to generate a value for the
2342 // first vector lane in each unroll iteration.
2343 assert(!VF.isScalable() &&
2344 "scalable vector reverse operation is not implemented");
2345 if (Group->isReverse())
2346 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2347
2348 for (unsigned Part = 0; Part < UF; Part++) {
2349 Value *AddrPart = State.get(Addr, {Part, 0});
2350 setDebugLocFromInst(Builder, AddrPart);
2351
2352 // Note that the current instruction could be at any member index. We need to
2353 // adjust the address to that of the member at index 0.
2354 //
2355 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2356 // b = A[i]; // Member of index 0
2357 // Current pointer is pointed to A[i+1], adjust it to A[i].
2358 //
2359 // E.g. A[i+1] = a; // Member of index 1
2360 // A[i] = b; // Member of index 0
2361 // A[i+2] = c; // Member of index 2 (Current instruction)
2362 // Current pointer is pointed to A[i+2], adjust it to A[i].
2363
2364 bool InBounds = false;
2365 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2366 InBounds = gep->isInBounds();
2367 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2368 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2369
2370 // Cast to the vector pointer type.
2371 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2372 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2373 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2374 }
2375
2376 setDebugLocFromInst(Builder, Instr);
2377 Value *UndefVec = UndefValue::get(VecTy);
2378
2379 Value *MaskForGaps = nullptr;
2380 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2381 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2382 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2383 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2384 }
2385
2386 // Vectorize the interleaved load group.
2387 if (isa<LoadInst>(Instr)) {
2388 // For each unroll part, create a wide load for the group.
2389 SmallVector<Value *, 2> NewLoads;
2390 for (unsigned Part = 0; Part < UF; Part++) {
2391 Instruction *NewLoad;
2392 if (BlockInMask || MaskForGaps) {
2393 assert(useMaskedInterleavedAccesses(*TTI) &&
2394 "masked interleaved groups are not allowed.");
2395 Value *GroupMask = MaskForGaps;
2396 if (BlockInMask) {
2397 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2398 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2399 Value *ShuffledMask = Builder.CreateShuffleVector(
2400 BlockInMaskPart,
2401 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2402 "interleaved.mask");
2403 GroupMask = MaskForGaps
2404 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2405 MaskForGaps)
2406 : ShuffledMask;
2407 }
2408 NewLoad =
2409 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2410 GroupMask, UndefVec, "wide.masked.vec");
2411 }
2412 else
2413 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2414 Group->getAlign(), "wide.vec");
2415 Group->addMetadata(NewLoad);
2416 NewLoads.push_back(NewLoad);
2417 }
2418
2419 // For each member in the group, shuffle out the appropriate data from the
2420 // wide loads.
2421 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2422 Instruction *Member = Group->getMember(I);
2423
2424 // Skip the gaps in the group.
2425 if (!Member)
2426 continue;
2427
2428 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2429 auto StrideMask =
2430 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2431 for (unsigned Part = 0; Part < UF; Part++) {
2432 Value *StridedVec = Builder.CreateShuffleVector(
2433 NewLoads[Part], StrideMask, "strided.vec");
2434
2435 // If this member has a different type, cast the result to that type.
2436 if (Member->getType() != ScalarTy) {
2437 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2438 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2439 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2440 }
2441
2442 if (Group->isReverse())
2443 StridedVec = reverseVector(StridedVec);
2444
2445 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2446 }
2447 }
2448 return;
2449 }
2450
2451 // The sub vector type for current instruction.
2452 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2453 auto *SubVT = VectorType::get(ScalarTy, VF);
2454
2455 // Vectorize the interleaved store group.
2456 for (unsigned Part = 0; Part < UF; Part++) {
2457 // Collect the stored vector from each member.
2458 SmallVector<Value *, 4> StoredVecs;
2459 for (unsigned i = 0; i < InterleaveFactor; i++) {
2460 // An interleaved store group doesn't allow gaps, so each index has a member.
2461 Instruction *Member = Group->getMember(i);
2462 assert(Member && "Fail to get a member from an interleaved store group");
2463
2464 Value *StoredVec = getOrCreateVectorValue(
2465 cast<StoreInst>(Member)->getValueOperand(), Part);
2466 if (Group->isReverse())
2467 StoredVec = reverseVector(StoredVec);
2468
2469 // If this member has a different type, cast it to the unified type.
2470
2471 if (StoredVec->getType() != SubVT)
2472 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2473
2474 StoredVecs.push_back(StoredVec);
2475 }
2476
2477 // Concatenate all vectors into a wide vector.
2478 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2479
2480 // Interleave the elements in the wide vector.
2481 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2482 Value *IVec = Builder.CreateShuffleVector(
2483 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2484 "interleaved.vec");
2485
2486 Instruction *NewStoreInstr;
2487 if (BlockInMask) {
2488 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2489 Value *ShuffledMask = Builder.CreateShuffleVector(
2490 BlockInMaskPart,
2491 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2492 "interleaved.mask");
2493 NewStoreInstr = Builder.CreateMaskedStore(
2494 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2495 }
2496 else
2497 NewStoreInstr =
2498 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2499
2500 Group->addMetadata(NewStoreInstr);
2501 }
2502}
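// A minimal sketch (plain C++, illustrative only, assuming the usual semantics of
// the mask helpers used above) of the shuffle masks for an interleave factor of 3
// and VF = 4, matching the R/G/B example in the comment before this function:
//   member I stride mask : <I, I+3, I+6, I+9>            (e.g. <0,3,6,9> for R)
//   replicated block mask: <0,0,0, 1,1,1, 2,2,2, 3,3,3>
//   interleave store mask: <0,4,8, 1,5,9, 2,6,10, 3,7,11>
#include <vector>

static std::vector<int> strideMaskSketch(unsigned Member, unsigned Factor, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back((int)(Member + i * Factor));
  return Mask;
}

static std::vector<int> interleaveMaskSketch(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Mask.push_back((int)(j * VF + i));
  return Mask;
}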
2503
2504void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2505 VPTransformState &State,
2506 VPValue *Addr,
2507 VPValue *StoredValue,
2508 VPValue *BlockInMask) {
2509 // Attempt to issue a wide load.
2510 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2511 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2512
2513 assert((LI || SI) && "Invalid Load/Store instruction");
2514 assert((!SI || StoredValue) && "No stored value provided for widened store");
2515 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2516
2517 LoopVectorizationCostModel::InstWidening Decision =
2518 Cost->getWideningDecision(Instr, VF);
2519 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2520 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2521 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2522 "CM decision is not to widen the memory instruction");
2523
2524 Type *ScalarDataTy = getMemInstValueType(Instr);
2525
2526 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2527 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2528 const Align Alignment = getLoadStoreAlignment(Instr);
2529
2530 // Determine if the pointer operand of the access is either consecutive or
2531 // reverse consecutive.
2532 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2533 bool ConsecutiveStride =
2534 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2535 bool CreateGatherScatter =
2536 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2537
2538 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2539 // gather/scatter. Otherwise Decision should have been to Scalarize.
2540 assert((ConsecutiveStride || CreateGatherScatter) &&
2541 "The instruction should be scalarized");
2542 (void)ConsecutiveStride;
2543
2544 VectorParts BlockInMaskParts(UF);
2545 bool isMaskRequired = BlockInMask;
2546 if (isMaskRequired)
2547 for (unsigned Part = 0; Part < UF; ++Part)
2548 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2549
2550 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2551 // Calculate the pointer for the specific unroll-part.
2552 GetElementPtrInst *PartPtr = nullptr;
2553
2554 bool InBounds = false;
2555 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2556 InBounds = gep->isInBounds();
2557
2558 if (Reverse) {
2559 // If the address is consecutive but reversed, then the
2560 // wide store needs to start at the last vector element.
2561 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2562 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2563 PartPtr->setIsInBounds(InBounds);
2564 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2565 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2566 PartPtr->setIsInBounds(InBounds);
2567 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2568 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2569 } else {
2570 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2571 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2572 PartPtr->setIsInBounds(InBounds);
2573 }
2574
2575 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2576 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2577 };
2578
2579 // Handle Stores:
2580 if (SI) {
2581 setDebugLocFromInst(Builder, SI);
2582
2583 for (unsigned Part = 0; Part < UF; ++Part) {
2584 Instruction *NewSI = nullptr;
2585 Value *StoredVal = State.get(StoredValue, Part);
2586 if (CreateGatherScatter) {
2587 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2588 Value *VectorGep = State.get(Addr, Part);
2589 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2590 MaskPart);
2591 } else {
2592 if (Reverse) {
2593 // If we store to reverse consecutive memory locations, then we need
2594 // to reverse the order of elements in the stored value.
2595 StoredVal = reverseVector(StoredVal);
2596 // We don't want to update the value in the map as it might be used in
2597 // another expression. So don't call resetVectorValue(StoredVal).
2598 }
2599 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2600 if (isMaskRequired)
2601 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2602 BlockInMaskParts[Part]);
2603 else
2604 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2605 }
2606 addMetadata(NewSI, SI);
2607 }
2608 return;
2609 }
2610
2611 // Handle loads.
2612 assert(LI && "Must have a load instruction");
2613 setDebugLocFromInst(Builder, LI);
2614 for (unsigned Part = 0; Part < UF; ++Part) {
2615 Value *NewLI;
2616 if (CreateGatherScatter) {
2617 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2618 Value *VectorGep = State.get(Addr, Part);
2619 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2620 nullptr, "wide.masked.gather");
2621 addMetadata(NewLI, LI);
2622 } else {
2623 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2624 if (isMaskRequired)
2625 NewLI = Builder.CreateMaskedLoad(
2626 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2627 "wide.masked.load");
2628 else
2629 NewLI =
2630 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2631
2632 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2633 addMetadata(NewLI, LI);
2634 if (Reverse)
2635 NewLI = reverseVector(NewLI);
2636 }
2637 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2638 }
2639}
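// A minimal sketch (plain C++, illustrative only) of the per-part element offset
// computed by CreateVecPtr above: Part * VF for a forward consecutive access, and
// -(Part * VF) + (1 - VF) for a reverse-consecutive access, so each wide load or
// store starts at the lowest address it touches before the result is reversed.
static long partPointerOffsetSketch(unsigned Part, unsigned VF, bool Reverse) {
  if (!Reverse)
    return (long)Part * VF;
  return -(long)(Part * VF) + (1 - (long)VF);
}

// E.g. with VF = 4, forward parts start at offsets 0, 4, 8, ... while reverse
// parts start at offsets -3, -7, -11, ...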
2640
2641void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2642 const VPIteration &Instance,
2643 bool IfPredicateInstr,
2644 VPTransformState &State) {
2645 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2646
2647 setDebugLocFromInst(Builder, Instr);
2648
2649 // Does this instruction return a value ?
2650 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2651
2652 Instruction *Cloned = Instr->clone();
2653 if (!IsVoidRetTy)
2654 Cloned->setName(Instr->getName() + ".cloned");
2655
2656 // Replace the operands of the cloned instructions with their scalar
2657 // equivalents in the new loop.
2658 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2659 auto *NewOp = State.get(User.getOperand(op), Instance);
2660 Cloned->setOperand(op, NewOp);
2661 }
2662 addNewMetadata(Cloned, Instr);
2663
2664 // Place the cloned scalar in the new loop.
2665 Builder.Insert(Cloned);
2666
2667 // Add the cloned scalar to the scalar map entry.
2668 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2669
2670 // If we just cloned a new assumption, add it the assumption cache.
2671 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2672 if (II->getIntrinsicID() == Intrinsic::assume)
2673 AC->registerAssumption(II);
2674
2675 // End if-block.
2676 if (IfPredicateInstr)
2677 PredicatedInstructions.push_back(Cloned);
2678}
2679
2680PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2681 Value *End, Value *Step,
2682 Instruction *DL) {
2683 BasicBlock *Header = L->getHeader();
2684 BasicBlock *Latch = L->getLoopLatch();
2685 // As we're just creating this loop, it's possible no latch exists
2686 // yet. If so, use the header as this will be a single block loop.
2687 if (!Latch)
2688 Latch = Header;
2689
2690 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2691 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2692 setDebugLocFromInst(Builder, OldInst);
2693 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2694
2695 Builder.SetInsertPoint(Latch->getTerminator());
2696 setDebugLocFromInst(Builder, OldInst);
2697
2698 // Create i+1 and fill the PHINode.
2699 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2700 Induction->addIncoming(Start, L->getLoopPreheader());
2701 Induction->addIncoming(Next, Latch);
2702 // Create the compare.
2703 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2704 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2705
2706 // Now we have two terminators. Remove the old one from the block.
2707 Latch->getTerminator()->eraseFromParent();
2708
2709 return Induction;
2710}
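// A minimal sketch (plain C++, illustrative only) of the loop structure created
// above: a canonical induction variable that starts at Start, is incremented by
// Step in the latch, and exits when the incremented value equals End. The earlier
// minimum-iteration check guarantees the body runs at least once.
static void inductionSketch(long Start, long End, long Step) {
  long Index = Start;                  // "index" PHI in the header
  for (;;) {
    // ... vector loop body for this Index ...
    long Next = Index + Step;          // "index.next"
    if (Next == End)                   // CreateICmpEQ(Next, End)
      break;                           // branch to the exit block
    Index = Next;                      // value fed back through the PHI
  }
}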
2711
2712Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2713 if (TripCount)
2714 return TripCount;
2715
2716 assert(L && "Create Trip Count for null loop.");
2717 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2718 // Find the loop boundaries.
2719 ScalarEvolution *SE = PSE.getSE();
2720 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2721 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2722 "Invalid loop count");
2723
2724 Type *IdxTy = Legal->getWidestInductionType();
2725 assert(IdxTy && "No type for induction");
2726
2727 // The exit count might have the type of i64 while the phi is i32. This can
2728 // happen if we have an induction variable that is sign extended before the
2729 // compare. The only way that we get a backedge taken count is that the
2730 // induction variable was signed and as such will not overflow. In such a case
2731 // truncation is legal.
2732 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2733 IdxTy->getPrimitiveSizeInBits())
2734 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2735 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2736
2737 // Get the total trip count from the count by adding 1.
2738 const SCEV *ExitCount = SE->getAddExpr(
2739 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2740
2741 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2742
2743 // Expand the trip count and place the new instructions in the preheader.
2744 // Notice that the pre-header does not change, only the loop body.
2745 SCEVExpander Exp(*SE, DL, "induction");
2746
2747 // Count holds the overall loop count (N).
2748 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2749 L->getLoopPreheader()->getTerminator());
2750
2751 if (TripCount->getType()->isPointerTy())
2752 TripCount =
2753 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2754 L->getLoopPreheader()->getTerminator());
2755
2756 return TripCount;
2757}
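// A minimal sketch (plain C++, illustrative only) of the count computed above: the
// trip count N is the backedge-taken count plus one, after the count has been
// brought to the widest induction type.
static unsigned long tripCountSketch(unsigned long BackedgeTakenCount) {
  return BackedgeTakenCount + 1;       // SE->getAddExpr(BTC, 1)
}

// E.g. a loop whose backedge is taken 99 times has a trip count of 100.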
2758
2759Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2760 if (VectorTripCount)
2761 return VectorTripCount;
2762
2763 Value *TC = getOrCreateTripCount(L);
2764 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2765
2766 Type *Ty = TC->getType();
2767 // This is where we can make the step a runtime constant.
2768 assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2769 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2770
2771 // If the tail is to be folded by masking, round the number of iterations N
2772 // up to a multiple of Step instead of rounding down. This is done by first
2773 // adding Step-1 and then rounding down. Note that it's ok if this addition
2774 // overflows: the vector induction variable will eventually wrap to zero given
2775 // that it starts at zero and its Step is a power of two; the loop will then
2776 // exit, with the last early-exit vector comparison also producing all-true.
2777 if (Cost->foldTailByMasking()) {
2778 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2779 "VF*UF must be a power of 2 when folding tail by masking");
2780 TC = Builder.CreateAdd(
2781 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2782 }
2783
2784 // Now we need to generate the expression for the part of the loop that the
2785 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2786 // iterations are not required for correctness, or N - Step, otherwise. Step
2787 // is equal to the vectorization factor (number of SIMD elements) times the
2788 // unroll factor (number of SIMD instructions).
2789 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2790
2791 // If there is a non-reversed interleaved group that may speculatively access
2792 // memory out-of-bounds, we need to ensure that there will be at least one
2793 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2794 // the trip count, we set the remainder to be equal to the step. If the step
2795 // does not evenly divide the trip count, no adjustment is necessary since
2796 // there will already be scalar iterations. Note that the minimum iterations
2797 // check ensures that N >= Step.
2798 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2799 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2800 R = Builder.CreateSelect(IsZero, Step, R);
2801 }
2802
2803 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2804
2805 return VectorTripCount;
2806}
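// A minimal sketch (plain C++, illustrative only) of the vector trip count logic
// above, with Step = VF * UF: round N up to a multiple of Step when the tail is
// folded by masking, then subtract the remainder, forcing at least one scalar
// iteration when a scalar epilogue is required.
static unsigned long vectorTripCountSketch(unsigned long N, unsigned VF, unsigned UF,
                                           bool FoldTail, bool RequiresScalarEpilogue) {
  unsigned long Step = (unsigned long)VF * UF;
  if (FoldTail)
    N += Step - 1;                     // "n.rnd.up" (Step is a power of two)
  unsigned long R = N % Step;          // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                          // keep at least one scalar iteration
  return N - R;                        // "n.vec"
}

// E.g. N = 103 with VF = 8, UF = 1: n.mod.vf = 7 and n.vec = 96; with tail folding
// the rounded-up count 110 gives n.vec = 104, covering all iterations under a mask.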
2807
2808Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2809 const DataLayout &DL) {
2810 // Verify that V is a vector type with same number of elements as DstVTy.
2811 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2812 unsigned VF = DstFVTy->getNumElements();
2813 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2814 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2815 Type *SrcElemTy = SrcVecTy->getElementType();
2816 Type *DstElemTy = DstFVTy->getElementType();
2817 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2818 "Vector elements must have same size");
2819
2820 // Do a direct cast if element types are castable.
2821 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2822 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2823 }
2824 // V cannot be directly casted to desired vector type.
2825 // May happen when V is a floating point vector but DstVTy is a vector of
2826 // pointers or vice-versa. Handle this using a two-step bitcast using an
2827 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2828 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2829 "Only one type should be a pointer type");
2830 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2831 "Only one type should be a floating point type");
2832 Type *IntTy =
2833 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2834 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2835 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2836 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2837}
2838
2839void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2840 BasicBlock *Bypass) {
2841 Value *Count = getOrCreateTripCount(L);
2842 // Reuse existing vector loop preheader for TC checks.
2843 // Note that new preheader block is generated for vector loop.
2844 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2845 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2846
2847 // Generate code to check if the loop's trip count is less than VF * UF, or
2848 // equal to it in case a scalar epilogue is required; this implies that the
2849 // vector trip count is zero. This check also covers the case where adding one
2850 // to the backedge-taken count overflowed leading to an incorrect trip count
2851 // of zero. In this case we will also jump to the scalar loop.
2852 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2853 : ICmpInst::ICMP_ULT;
2854
2855 // If tail is to be folded, vector loop takes care of all iterations.
2856 Value *CheckMinIters = Builder.getFalse();
2857 if (!Cost->foldTailByMasking()) {
2858 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2859 CheckMinIters = Builder.CreateICmp(
2860 P, Count,
2861 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2862 "min.iters.check");
2863 }
2864 // Create new preheader for vector loop.
2865 LoopVectorPreHeader =
2866 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2867 "vector.ph");
2868
2869 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2870 DT->getNode(Bypass)->getIDom()) &&
2871 "TC check is expected to dominate Bypass");
2872
2873 // Update dominator for Bypass & LoopExit.
2874 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2875 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2876
2877 ReplaceInstWithInst(
2878 TCCheckBlock->getTerminator(),
2879 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2880 LoopBypassBlocks.push_back(TCCheckBlock);
2881}
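// [Annotation, not part of the original source] Illustrative sketch of the IR
// produced by emitMinimumIterationCountCheck above, assuming VF = 4, UF = 2,
// no tail folding and no required scalar epilogue (names are approximate):
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. if fewer than VF * UF iterations remain, control takes the bypass edge
// to the scalar loop instead of entering the vector preheader.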
2882
2883void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2884 // Reuse existing vector loop preheader for SCEV checks.
2885 // Note that new preheader block is generated for vector loop.
2886 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2887
2888 // Generate the code to check the SCEV assumptions that we made.
2889 // We want the new basic block to start at the first instruction in a
2890 // sequence of instructions that form a check.
2891 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2892 "scev.check");
2893 Value *SCEVCheck = Exp.expandCodeForPredicate(
2894 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2895
2896 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2897 if (C->isZero())
2898 return;
2899
2900 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2901 (OptForSizeBasedOnProfile &&
2902 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2903 "Cannot SCEV check stride or overflow when optimizing for size");
2904
2905 SCEVCheckBlock->setName("vector.scevcheck");
2906 // Create new preheader for vector loop.
2907 LoopVectorPreHeader =
2908 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2909 nullptr, "vector.ph");
2910
2911 // Update dominator only if this is first RT check.
2912 if (LoopBypassBlocks.empty()) {
2913 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2914 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2915 }
2916
2917 ReplaceInstWithInst(
2918 SCEVCheckBlock->getTerminator(),
2919 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2920 LoopBypassBlocks.push_back(SCEVCheckBlock);
2921 AddedSafetyChecks = true;
2922}
2923
2924void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2925 // VPlan-native path does not do any analysis for runtime checks currently.
2926 if (EnableVPlanNativePath)
2927 return;
2928
2929 // Reuse existing vector loop preheader for runtime memory checks.
2930 // Note that new preheader block is generated for vector loop.
2931 BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2932
2933 // Generate the code that checks in runtime if arrays overlap. We put the
2934 // checks into a separate block to make the more common case of few elements
2935 // faster.
2936 auto *LAI = Legal->getLAI();
2937 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2938 if (!RtPtrChecking.Need)
2939 return;
2940
2941 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2942 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2943 "Cannot emit memory checks when optimizing for size, unless forced "
2944 "to vectorize.");
2945 ORE->emit([&]() {
2946 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2947 L->getStartLoc(), L->getHeader())
2948 << "Code-size may be reduced by not forcing "
2949 "vectorization, or by source-code modifications "
2950 "eliminating the need for runtime checks "
2951 "(e.g., adding 'restrict').";
2952 });
2953 }
2954
2955 MemCheckBlock->setName("vector.memcheck");
2956 // Create new preheader for vector loop.
2957 LoopVectorPreHeader =
2958 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2959 "vector.ph");
2960
2961 auto *CondBranch = cast<BranchInst>(
2962 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
2963 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
2964 LoopBypassBlocks.push_back(MemCheckBlock);
2965 AddedSafetyChecks = true;
2966
2967 // Update dominator only if this is first RT check.
2968 if (LoopBypassBlocks.empty()) {
2969 DT->changeImmediateDominator(Bypass, MemCheckBlock);
2970 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2971 }
2972
2973 Instruction *FirstCheckInst;
2974 Instruction *MemRuntimeCheck;
2975 std::tie(FirstCheckInst, MemRuntimeCheck) =
2976 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2977 RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2978 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2979 "claimed checks are required");
2980 CondBranch->setCondition(MemRuntimeCheck);
2981
2982 // We currently don't use LoopVersioning for the actual loop cloning but we
2983 // still use it to add the noalias metadata.
2984 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2985 PSE.getSE());
2986 LVer->prepareNoAliasMetadata();
2987}
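// [Annotation, not part of the original source] Conceptually, the checks added
// by addRuntimeChecks above test, for each pair of pointer groups A and B that
// may alias, whether their accessed ranges overlap, roughly:
//   conflict(A, B) = (A.start < B.end) && (B.start < A.end)
// If any pair may conflict, the condition installed on MemCheckBlock's
// terminator takes the bypass edge to the scalar loop instead of the vector
// preheader.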
2988
2989Value *InnerLoopVectorizer::emitTransformedIndex(
2990 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2991 const InductionDescriptor &ID) const {
2992
2993 SCEVExpander Exp(*SE, DL, "induction");
2994 auto Step = ID.getStep();
2995 auto StartValue = ID.getStartValue();
2996 assert(Index->getType() == Step->getType() &&
2997 "Index type does not match StepValue type");
2998
2999 // Note: the IR at this point is broken. We cannot use SE to create any new
3000 // SCEV and then expand it, hoping that SCEV's simplification will give us
3001 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3002 // lead to various SCEV crashes. So all we can do is use the builder and rely
3003 // on InstCombine for future simplifications. Here we handle only some
3004 // trivial cases.
3005 auto CreateAdd = [&B](Value *X, Value *Y) {
3006 assert(X->getType() == Y->getType() && "Types don't match!");
3007 if (auto *CX = dyn_cast<ConstantInt>(X))
3008 if (CX->isZero())
3009 return Y;
3010 if (auto *CY = dyn_cast<ConstantInt>(Y))
3011 if (CY->isZero())
3012 return X;
3013 return B.CreateAdd(X, Y);
3014 };
3015
3016 auto CreateMul = [&B](Value *X, Value *Y) {
3017 assert(X->getType() == Y->getType() && "Types don't match!");
3018 if (auto *CX = dyn_cast<ConstantInt>(X))
3019 if (CX->isOne())
3020 return Y;
3021 if (auto *CY = dyn_cast<ConstantInt>(Y))
3022 if (CY->isOne())
3023 return X;
3024 return B.CreateMul(X, Y);
3025 };
3026
3027 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3028 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3029 // the DomTree is not kept up-to-date for additional blocks generated in the
3030 // vector loop. By using the header as insertion point, we guarantee that the
3031 // expanded instructions dominate all their uses.
3032 auto GetInsertPoint = [this, &B]() {
3033 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3034 if (InsertBB != LoopVectorBody &&
3035 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3036 return LoopVectorBody->getTerminator();
3037 return &*B.GetInsertPoint();
3038 };
3039 switch (ID.getKind()) {
3040 case InductionDescriptor::IK_IntInduction: {
3041 assert(Index->getType() == StartValue->getType() &&
3042 "Index type does not match StartValue type");
3043 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3044 return B.CreateSub(StartValue, Index);
3045 auto *Offset = CreateMul(
3046 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3047 return CreateAdd(StartValue, Offset);
3048 }
3049 case InductionDescriptor::IK_PtrInduction: {
3050 assert(isa<SCEVConstant>(Step) &&
3051 "Expected constant step for pointer induction");
3052 return B.CreateGEP(
3053 StartValue->getType()->getPointerElementType(), StartValue,
3054 CreateMul(Index,
3055 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3056 }
3057 case InductionDescriptor::IK_FpInduction: {
3058 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3059 auto InductionBinOp = ID.getInductionBinOp();
3060 assert(InductionBinOp &&
3061 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3062 InductionBinOp->getOpcode() == Instruction::FSub) &&
3063 "Original bin op should be defined for FP induction");
3064
3065 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3066
3067 // Floating point operations had to be 'fast' to enable the induction.
3068 FastMathFlags Flags;
3069 Flags.setFast();
3070
3071 Value *MulExp = B.CreateFMul(StepValue, Index);
3072 if (isa<Instruction>(MulExp))
3073 // We have to check because MulExp may be a constant.
3074 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3075
3076 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3077 "induction");
3078 if (isa<Instruction>(BOp))
3079 cast<Instruction>(BOp)->setFastMathFlags(Flags);
3080
3081 return BOp;
3082 }
3083 case InductionDescriptor::IK_NoInduction:
3084 return nullptr;
3085 }
3086 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3086)
;
3087}
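// [Annotation, not part of the original source] Summary of the cases handled
// above: for an integer induction the transformed index is
// StartValue + Index * Step; for a pointer induction it is a GEP,
// &StartValue[Index * Step]; and for a floating-point induction it is
// StartValue fadd/fsub (Index * Step), reusing the original FAdd/FSub opcode.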
3088
3089Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3090 LoopScalarBody = OrigLoop->getHeader();
3091 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3092 LoopExitBlock = OrigLoop->getExitBlock();
3093 assert(LoopExitBlock && "Must have an exit block");
3094 assert(LoopVectorPreHeader && "Invalid loop structure");
3095
3096 LoopMiddleBlock =
3097 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3098 LI, nullptr, Twine(Prefix) + "middle.block");
3099 LoopScalarPreHeader =
3100 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3101 nullptr, Twine(Prefix) + "scalar.ph");
3102 // We intentionally don't let SplitBlock update LoopInfo since
3103 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3104 // LoopVectorBody is explicitly added to the correct place a few lines later.
3105 LoopVectorBody =
3106 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3107 nullptr, nullptr, Twine(Prefix) + "vector.body");
3108
3109 // Update dominator for loop exit.
3110 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3111
3112 // Create and register the new vector loop.
3113 Loop *Lp = LI->AllocateLoop();
3114 Loop *ParentLoop = OrigLoop->getParentLoop();
3115
3116 // Insert the new loop into the loop nest and register the new basic blocks
3117 // before calling any utilities such as SCEV that require valid LoopInfo.
3118 if (ParentLoop) {
3119 ParentLoop->addChildLoop(Lp);
3120 } else {
3121 LI->addTopLevelLoop(Lp);
3122 }
3123 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3124 return Lp;
3125}
3126
3127void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3128 Value *VectorTripCount) {
3129 assert(VectorTripCount && L && "Expected valid arguments");
3130 // We are going to resume the execution of the scalar loop.
3131 // Go over all of the induction variables that we found and fix the
3132 // PHIs that are left in the scalar version of the loop.
3133 // The starting values of PHI nodes depend on the counter of the last
3134 // iteration in the vectorized loop.
3135 // If we come from a bypass edge then we need to start from the original
3136 // start value.
3137 for (auto &InductionEntry : Legal->getInductionVars()) {
3138 PHINode *OrigPhi = InductionEntry.first;
3139 InductionDescriptor II = InductionEntry.second;
3140
3141 // Create phi nodes to merge from the backedge-taken check block.
3142 PHINode *BCResumeVal =
3143 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3144 LoopScalarPreHeader->getTerminator());
3145 // Copy original phi DL over to the new one.
3146 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3147 Value *&EndValue = IVEndValues[OrigPhi];
3148 if (OrigPhi == OldInduction) {
3149 // We know what the end value is.
3150 EndValue = VectorTripCount;
3151 } else {
3152 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3153 Type *StepType = II.getStep()->getType();
3154 Instruction::CastOps CastOp =
3155 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3156 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3157 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3158 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3159 EndValue->setName("ind.end");
3160 }
3161
3162 // The new PHI merges the original incoming value, in case of a bypass,
3163 // or the value at the end of the vectorized loop.
3164 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3165
3166 // Fix the scalar body counter (PHI node).
3167 // The old induction's phi node in the scalar body needs the truncated
3168 // value.
3169 for (BasicBlock *BB : LoopBypassBlocks)
3170 BCResumeVal->addIncoming(II.getStartValue(), BB);
3171 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3172 }
3173}
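// [Annotation, not part of the original source] Illustrative shape of a resume
// phi created above (block and value names are approximate):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                            [ %start, %vector.memcheck ],
//                            [ %start, %vector.scevcheck ], ...
// One incoming value per bypass block, each carrying the original start value,
// plus the end-of-vector-loop value arriving from the middle block.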
3174
3175BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3176 MDNode *OrigLoopID) {
3177 assert(L && "Expected valid loop.")((L && "Expected valid loop.") ? static_cast<void>
(0) : __assert_fail ("L && \"Expected valid loop.\""
, "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3177, __PRETTY_FUNCTION__))
;
3178
3179 // The trip counts should be cached by now.
3180 Value *Count = getOrCreateTripCount(L);
3181 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3182
3183 // We need the OrigLoop (scalar loop part) latch terminator to help
3184 // produce correct debug info for the middle block BB instructions.
3185 // The legality check stage guarantees that the loop will have a single
3186 // latch.
3187 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3188 "Scalar loop latch terminator isn't a branch");
3189 BranchInst *ScalarLatchBr =
3190 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3191
3192 // Add a check in the middle block to see if we have completed
3193 // all of the iterations in the first vector loop.
3194 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3195 // If tail is to be folded, we know we don't need to run the remainder.
3196 Value *CmpN = Builder.getTrue();
3197 if (!Cost->foldTailByMasking()) {
3198 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3199 VectorTripCount, "cmp.n",
3200 LoopMiddleBlock->getTerminator());
3201
3202 // Here we use the same DebugLoc as the scalar loop latch branch instead
3203 // of the corresponding compare because they may have ended up with
3204 // different line numbers and we want to avoid awkward line stepping while
3205 // debugging. Eg. if the compare has got a line number inside the loop.
3206 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3207 }
3208
3209 BranchInst *BrInst =
3210 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3211 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3212 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3213
3214 // Get ready to start creating new instructions into the vectorized body.
3215 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3216 "Inconsistent vector loop preheader");
3217 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3218
3219 Optional<MDNode *> VectorizedLoopID =
3220 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3221 LLVMLoopVectorizeFollowupVectorized});
3222 if (VectorizedLoopID.hasValue()) {
3223 L->setLoopID(VectorizedLoopID.getValue());
3224
3225 // Do not setAlreadyVectorized if loop attributes have been defined
3226 // explicitly.
3227 return LoopVectorPreHeader;
3228 }
3229
3230 // Keep all loop hints from the original loop on the vector loop (we'll
3231 // replace the vectorizer-specific hints below).
3232 if (MDNode *LID = OrigLoop->getLoopID())
3233 L->setLoopID(LID);
3234
3235 LoopVectorizeHints Hints(L, true, *ORE);
3236 Hints.setAlreadyVectorized();
3237
3238#ifdef EXPENSIVE_CHECKS
3239 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3240 LI->verify(*DT);
3241#endif
3242
3243 return LoopVectorPreHeader;
3244}
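// [Annotation, not part of the original source] Illustrative sketch of the
// middle-block branch created above when the tail is not folded:
//   %cmp.n = icmp eq i64 %trip.count, %vector.trip.count
//   br i1 %cmp.n, label %exit.block, label %scalar.ph
// With tail folding, CmpN stays 'true' and the middle block effectively always
// branches to the exit block.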
3245
3246BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3247 /*
3248 In this function we generate a new loop. The new loop will contain
3249 the vectorized instructions while the old loop will continue to run the
3250 scalar remainder.
3251
3252 [ ] <-- loop iteration number check.
3253 / |
3254 / v
3255 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3256 | / |
3257 | / v
3258 || [ ] <-- vector pre header.
3259 |/ |
3260 | v
3261 | [ ] \
3262 | [ ]_| <-- vector loop.
3263 | |
3264 | v
3265 | -[ ] <--- middle-block.
3266 | / |
3267 | / v
3268 -|- >[ ] <--- new preheader.
3269 | |
3270 | v
3271 | [ ] \
3272 | [ ]_| <-- old scalar loop to handle remainder.
3273 \ |
3274 \ v
3275 >[ ] <-- exit block.
3276 ...
3277 */
3278
3279 // Get the metadata of the original loop before it gets modified.
3280 MDNode *OrigLoopID = OrigLoop->getLoopID();
3281
3282 // Create an empty vector loop, and prepare basic blocks for the runtime
3283 // checks.
3284 Loop *Lp = createVectorLoopSkeleton("");
3285
3286 // Now, compare the new count to zero. If it is zero skip the vector loop and
3287 // jump to the scalar loop. This check also covers the case where the
3288 // backedge-taken count is uint##_max: adding one to it will overflow leading
3289 // to an incorrect trip count of zero. In this (rare) case we will also jump
3290 // to the scalar loop.
3291 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3292
3293 // Generate the code to check any assumptions that we've made for SCEV
3294 // expressions.
3295 emitSCEVChecks(Lp, LoopScalarPreHeader);
3296
3297 // Generate the code that checks in runtime if arrays overlap. We put the
3298 // checks into a separate block to make the more common case of few elements
3299 // faster.
3300 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3301
3302 // Some loops have a single integer induction variable, while other loops
3303 // don't. One example is C++ iterators, which often have multiple pointer
3304 // induction variables. In the code below we also support a case where we
3305 // don't have a single induction variable.
3306 //
3307 // We try to obtain an induction variable from the original loop as hard
3308 // as possible. However if we don't find one that:
3309 // - is an integer
3310 // - counts from zero, stepping by one
3311 // - is the size of the widest induction variable type
3312 // then we create a new one.
3313 OldInduction = Legal->getPrimaryInduction();
3314 Type *IdxTy = Legal->getWidestInductionType();
3315 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3316 // The loop step is equal to the vectorization factor (num of SIMD elements)
3317 // times the unroll factor (num of SIMD instructions).
3318 assert(!VF.isScalable() && "scalable vectors not yet supported.");
3319 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3320 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3321 Induction =
3322 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3323 getDebugLocFromInstOrOperands(OldInduction));
3324
3325 // Emit phis for the new starting index of the scalar loop.
3326 createInductionResumeValues(Lp, CountRoundDown);
3327
3328 return completeLoopSkeleton(Lp, OrigLoopID);
3329}
3330
3331// Fix up external users of the induction variable. At this point, we are
3332// in LCSSA form, with all external PHIs that use the IV having one input value,
3333// coming from the remainder loop. We need those PHIs to also have a correct
3334// value for the IV when arriving directly from the middle block.
3335void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3336 const InductionDescriptor &II,
3337 Value *CountRoundDown, Value *EndValue,
3338 BasicBlock *MiddleBlock) {
3339 // There are two kinds of external IV usages - those that use the value
3340 // computed in the last iteration (the PHI) and those that use the penultimate
3341 // value (the value that feeds into the phi from the loop latch).
3342 // We allow both, but they, obviously, have different values.
3343
3344 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3345
3346 DenseMap<Value *, Value *> MissingVals;
3347
3348 // An external user of the last iteration's value should see the value that
3349 // the remainder loop uses to initialize its own IV.
3350 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3351 for (User *U : PostInc->users()) {
3352 Instruction *UI = cast<Instruction>(U);
3353 if (!OrigLoop->contains(UI)) {
3354 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3355 MissingVals[UI] = EndValue;
3356 }
3357 }
3358
3359 // An external user of the penultimate value needs to see EndValue - Step.
3360 // The simplest way to get this is to recompute it from the constituent SCEVs,
3361 // that is Start + (Step * (CRD - 1)).
3362 for (User *U : OrigPhi->users()) {
3363 auto *UI = cast<Instruction>(U);
3364 if (!OrigLoop->contains(UI)) {
3365 const DataLayout &DL =
3366 OrigLoop->getHeader()->getModule()->getDataLayout();
3367 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3368
3369 IRBuilder<> B(MiddleBlock->getTerminator());
3370 Value *CountMinusOne = B.CreateSub(
3371 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3372 Value *CMO =
3373 !II.getStep()->getType()->isIntegerTy()
3374 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3375 II.getStep()->getType())
3376 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3377 CMO->setName("cast.cmo");
3378 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3379 Escape->setName("ind.escape");
3380 MissingVals[UI] = Escape;
3381 }
3382 }
3383
3384 for (auto &I : MissingVals) {
3385 PHINode *PHI = cast<PHINode>(I.first);
3386 // One corner case we have to handle is two IVs "chasing" each other,
3387 // that is %IV2 = phi [...], [ %IV1, %latch ]
3388 // In this case, if IV1 has an external use, we need to avoid adding both
3389 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3390 // don't already have an incoming value for the middle block.
3391 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3392 PHI->addIncoming(I.second, MiddleBlock);
3393 }
3394}
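// [Annotation, not part of the original source] Illustrative effect of
// fixupIVUsers: an LCSSA phi in the exit block such as
//   %iv.lcssa = phi i64 [ %iv, %for.body ]
// gains a second incoming value from the middle block, e.g.
//   %iv.lcssa = phi i64 [ %iv, %for.body ], [ %ind.escape, %middle.block ]
// so the IV value is also correct when the vector loop covers all iterations
// and the scalar remainder is skipped.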
3395
3396namespace {
3397
3398struct CSEDenseMapInfo {
3399 static bool canHandle(const Instruction *I) {
3400 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3401 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3402 }
3403
3404 static inline Instruction *getEmptyKey() {
3405 return DenseMapInfo<Instruction *>::getEmptyKey();
3406 }
3407
3408 static inline Instruction *getTombstoneKey() {
3409 return DenseMapInfo<Instruction *>::getTombstoneKey();
3410 }
3411
3412 static unsigned getHashValue(const Instruction *I) {
3413 assert(canHandle(I) && "Unknown instruction!");
3414 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3415 I->value_op_end()));
3416 }
3417
3418 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3419 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3420 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3421 return LHS == RHS;
3422 return LHS->isIdenticalTo(RHS);
3423 }
3424};
3425
3426} // end anonymous namespace
3427
3428 /// Perform CSE of induction variable instructions.
3429static void cse(BasicBlock *BB) {
3430 // Perform simple cse.
3431 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3432 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3433 Instruction *In = &*I++;
3434
3435 if (!CSEDenseMapInfo::canHandle(In))
3436 continue;
3437
3438 // Check if we can replace this instruction with any of the
3439 // visited instructions.
3440 if (Instruction *V = CSEMap.lookup(In)) {
3441 In->replaceAllUsesWith(V);
3442 In->eraseFromParent();
3443 continue;
3444 }
3445
3446 CSEMap[In] = In;
3447 }
3448}
3449
3450unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3451 ElementCount VF,
3452 bool &NeedToScalarize) {
3453 assert(!VF.isScalable() && "scalable vectors not yet supported.");
3454 Function *F = CI->getCalledFunction();
3455 Type *ScalarRetTy = CI->getType();
3456 SmallVector<Type *, 4> Tys, ScalarTys;
3457 for (auto &ArgOp : CI->arg_operands())
3458 ScalarTys.push_back(ArgOp->getType());
3459
3460 // Estimate cost of scalarized vector call. The source operands are assumed
3461 // to be vectors, so we need to extract individual elements from there,
3462 // execute VF scalar calls, and then gather the result into the vector return
3463 // value.
3464 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3465 TTI::TCK_RecipThroughput);
3466 if (VF.isScalar())
3467 return ScalarCallCost;
3468
3469 // Compute corresponding vector type for return value and arguments.
3470 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3471 for (Type *ScalarTy : ScalarTys)
3472 Tys.push_back(ToVectorTy(ScalarTy, VF));
3473
3474 // Compute costs of unpacking argument values for the scalar calls and
3475 // packing the return values to a vector.
3476 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3477
3478 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3479
3480 // If we can't emit a vector call for this function, then the currently found
3481 // cost is the cost we need to return.
3482 NeedToScalarize = true;
3483 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3484 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3485
3486 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3487 return Cost;
3488
3489 // If the corresponding vector cost is cheaper, return its cost.
3490 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3491 TTI::TCK_RecipThroughput);
3492 if (VectorCallCost < Cost) {
3493 NeedToScalarize = false;
3494 return VectorCallCost;
3495 }
3496 return Cost;
3497}
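// [Annotation, not part of the original source] Worked example with assumed
// numbers: for VF = 4, a scalar call cost of 10 and a scalarization overhead
// of 6, the scalarized estimate computed above is 10 * 4 + 6 = 46. If a vector
// variant of the callee exists and its cost is below 46, NeedToScalarize is
// cleared and that vector call cost is returned instead.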
3498
3499unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3500 ElementCount VF) {
3501 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3502 assert(ID && "Expected intrinsic call!");
3503
3504 IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3505 return TTI.getIntrinsicInstrCost(CostAttrs,
3506 TargetTransformInfo::TCK_RecipThroughput);
3507}
3508
3509static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3510 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3511 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3512 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3513}
3514
3515static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3516 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3517 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3518 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3519}
3520
3521void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3522 // For every instruction `I` in MinBWs, truncate the operands, create a
3523 // truncated version of `I` and reextend its result. InstCombine runs
3524 // later and will remove any ext/trunc pairs.
3525 SmallPtrSet<Value *, 4> Erased;
3526 for (const auto &KV : Cost->getMinimalBitwidths()) {
3527 // If the value wasn't vectorized, we must maintain the original scalar
3528 // type. The absence of the value from VectorLoopValueMap indicates that it
3529 // wasn't vectorized.
3530 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3531 continue;
3532 for (unsigned Part = 0; Part < UF; ++Part) {
3533 Value *I = getOrCreateVectorValue(KV.first, Part);
3534 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3535 continue;
3536 Type *OriginalTy = I->getType();
3537 Type *ScalarTruncatedTy =
3538 IntegerType::get(OriginalTy->getContext(), KV.second);
3539 auto *TruncatedTy = FixedVectorType::get(
3540 ScalarTruncatedTy,
3541 cast<FixedVectorType>(OriginalTy)->getNumElements());
3542 if (TruncatedTy == OriginalTy)
3543 continue;
3544
3545 IRBuilder<> B(cast<Instruction>(I));
3546 auto ShrinkOperand = [&](Value *V) -> Value * {
3547 if (auto *ZI = dyn_cast<ZExtInst>(V))
3548 if (ZI->getSrcTy() == TruncatedTy)
3549 return ZI->getOperand(0);
3550 return B.CreateZExtOrTrunc(V, TruncatedTy);
3551 };
3552
3553 // The actual instruction modification depends on the instruction type,
3554 // unfortunately.
3555 Value *NewI = nullptr;
3556 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3557 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3558 ShrinkOperand(BO->getOperand(1)));
3559
3560 // Any wrapping introduced by shrinking this operation shouldn't be
3561 // considered undefined behavior. So, we can't unconditionally copy
3562 // arithmetic wrapping flags to NewI.
3563 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3564 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3565 NewI =
3566 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3567 ShrinkOperand(CI->getOperand(1)));
3568 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3569 NewI = B.CreateSelect(SI->getCondition(),
3570 ShrinkOperand(SI->getTrueValue()),
3571 ShrinkOperand(SI->getFalseValue()));
3572 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3573 switch (CI->getOpcode()) {
3574 default:
3575 llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3575)
;
3576 case Instruction::Trunc:
3577 NewI = ShrinkOperand(CI->getOperand(0));
3578 break;
3579 case Instruction::SExt:
3580 NewI = B.CreateSExtOrTrunc(
3581 CI->getOperand(0),
3582 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3583 break;
3584 case Instruction::ZExt:
3585 NewI = B.CreateZExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 }
3590 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3591 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3592 ->getNumElements();
3593 auto *O0 = B.CreateZExtOrTrunc(
3594 SI->getOperand(0),
3595 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3596 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3597 ->getNumElements();
3598 auto *O1 = B.CreateZExtOrTrunc(
3599 SI->getOperand(1),
3600 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3601
3602 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3603 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3604 // Don't do anything with the operands, just extend the result.
3605 continue;
3606 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3607 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3608 ->getNumElements();
3609 auto *O0 = B.CreateZExtOrTrunc(
3610 IE->getOperand(0),
3611 FixedVectorType::get(ScalarTruncatedTy, Elements));
3612 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3613 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3614 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3615 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3616 ->getNumElements();
3617 auto *O0 = B.CreateZExtOrTrunc(
3618 EE->getOperand(0),
3619 FixedVectorType::get(ScalarTruncatedTy, Elements));
3620 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3621 } else {
3622 // If we don't know what to do, be conservative and don't do anything.
3623 continue;
3624 }
3625
3626 // Lastly, extend the result.
3627 NewI->takeName(cast<Instruction>(I));
3628 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3629 I->replaceAllUsesWith(Res);
3630 cast<Instruction>(I)->eraseFromParent();
3631 Erased.insert(I);
3632 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3633 }
3634 }
3635
3636 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3637 for (const auto &KV : Cost->getMinimalBitwidths()) {
3638 // If the value wasn't vectorized, we must maintain the original scalar
3639 // type. The absence of the value from VectorLoopValueMap indicates that it
3640 // wasn't vectorized.
3641 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3642 continue;
3643 for (unsigned Part = 0; Part < UF; ++Part) {
3644 Value *I = getOrCreateVectorValue(KV.first, Part);
3645 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3646 if (Inst && Inst->use_empty()) {
3647 Value *NewI = Inst->getOperand(0);
3648 Inst->eraseFromParent();
3649 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3650 }
3651 }
3652 }
3653}
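// [Annotation, not part of the original source] Illustrative example: if
// MinBWs records that a value only needs 8 bits, a widened
//   %add = add <4 x i32> %a, %b
// is rewritten by the loop above into (pseudo-IR)
//   %add.trunc = add <4 x i8> (trunc %a), (trunc %b)
//   %add.ext   = zext <4 x i8> %add.trunc to <4 x i32>
// and InstCombine later removes any redundant ext/trunc pairs.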
3654
3655void InnerLoopVectorizer::fixVectorizedLoop() {
3656 // Insert truncates and extends for any truncated instructions as hints to
3657 // InstCombine.
3658 if (VF.isVector())
3659 truncateToMinimalBitwidths();
3660
3661 // Fix widened non-induction PHIs by setting up the PHI operands.
3662 if (OrigPHIsToFix.size()) {
3663 assert(EnableVPlanNativePath &&
3664 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3665 fixNonInductionPHIs();
3666 }
3667
3668 // At this point every instruction in the original loop is widened to a
3669 // vector form. Now we need to fix the recurrences in the loop. These PHI
3670 // nodes are currently empty because we did not want to introduce cycles.
3671 // This is the second stage of vectorizing recurrences.
3672 fixCrossIterationPHIs();
3673
3674 // Forget the original basic block.
3675 PSE.getSE()->forgetLoop(OrigLoop);
3676
3677 // Fix-up external users of the induction variables.
3678 for (auto &Entry : Legal->getInductionVars())
3679 fixupIVUsers(Entry.first, Entry.second,
3680 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3681 IVEndValues[Entry.first], LoopMiddleBlock);
3682
3683 fixLCSSAPHIs();
3684 for (Instruction *PI : PredicatedInstructions)
3685 sinkScalarOperands(&*PI);
3686
3687 // Remove redundant induction instructions.
3688 cse(LoopVectorBody);
3689
3690 // Set/update profile weights for the vector and remainder loops as original
3691 // loop iterations are now distributed among them. Note that original loop
3692 // represented by LoopScalarBody becomes remainder loop after vectorization.
3693 //
3694 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3695 // end up getting a slightly roughened result but that should be OK since
3696 // the profile is not inherently precise anyway. Note also that a possible bypass of
3697 // vector code caused by legality checks is ignored, assigning all the weight
3698 // to the vector loop, optimistically.
3699 assert(!VF.isScalable() &&
3700 "cannot use scalable ElementCount to determine unroll factor");
3701 setProfileInfoAfterUnrolling(
3702 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3703 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3704}
3705
3706void InnerLoopVectorizer::fixCrossIterationPHIs() {
3707 // In order to support recurrences we need to be able to vectorize Phi nodes.
3708 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3709 // stage #2: We now need to fix the recurrences by adding incoming edges to
3710 // the currently empty PHI nodes. At this point every instruction in the
3711 // original loop is widened to a vector form so we can use them to construct
3712 // the incoming edges.
3713 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3714 // Handle first-order recurrences and reductions that need to be fixed.
3715 if (Legal->isFirstOrderRecurrence(&Phi))
3716 fixFirstOrderRecurrence(&Phi);
3717 else if (Legal->isReductionVariable(&Phi))
3718 fixReduction(&Phi);
3719 }
3720}
3721
3722void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3723 // This is the second phase of vectorizing first-order recurrences. An
3724 // overview of the transformation is described below. Suppose we have the
3725 // following loop.
3726 //
3727 // for (int i = 0; i < n; ++i)
3728 // b[i] = a[i] - a[i - 1];
3729 //
3730 // There is a first-order recurrence on "a". For this loop, the shorthand
3731 // scalar IR looks like:
3732 //
3733 // scalar.ph:
3734 // s_init = a[-1]
3735 // br scalar.body
3736 //
3737 // scalar.body:
3738 // i = phi [0, scalar.ph], [i+1, scalar.body]
3739 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3740 // s2 = a[i]
3741 // b[i] = s2 - s1
3742 // br cond, scalar.body, ...
3743 //
3744 // In this example, s1 is a recurrence because its value depends on the
3745 // previous iteration. In the first phase of vectorization, we created a
3746 // temporary value for s1. We now complete the vectorization and produce the
3747 // shorthand vector IR shown below (for VF = 4, UF = 1).
3748 //
3749 // vector.ph:
3750 // v_init = vector(..., ..., ..., a[-1])
3751 // br vector.body
3752 //
3753 // vector.body
3754 // i = phi [0, vector.ph], [i+4, vector.body]
3755 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3756 // v2 = a[i, i+1, i+2, i+3];
3757 // v3 = vector(v1(3), v2(0, 1, 2))
3758 // b[i, i+1, i+2, i+3] = v2 - v3
3759 // br cond, vector.body, middle.block
3760 //
3761 // middle.block:
3762 // x = v2(3)
3763 // br scalar.ph
3764 //
3765 // scalar.ph:
3766 // s_init = phi [x, middle.block], [a[-1], otherwise]
3767 // br scalar.body
3768 //
3769 // After execution completes the vector loop, we extract the next value of
3770 // the recurrence (x) to use as the initial value in the scalar loop.
3771
3772 // Get the original loop preheader and single loop latch.
3773 auto *Preheader = OrigLoop->getLoopPreheader();
3774 auto *Latch = OrigLoop->getLoopLatch();
3775
3776 // Get the initial and previous values of the scalar recurrence.
3777 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3778 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3779
3780 // Create a vector from the initial value.
3781 auto *VectorInit = ScalarInit;
3782 if (VF.isVector()) {
3783 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3784 assert(!VF.isScalable() && "VF is assumed to be non scalable.")((!VF.isScalable() && "VF is assumed to be non scalable."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"VF is assumed to be non scalable.\""
, "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3784, __PRETTY_FUNCTION__))
;
3785 VectorInit = Builder.CreateInsertElement(
3786 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3787 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3788 }
3789
3790 // We constructed a temporary phi node in the first phase of vectorization.
3791 // This phi node will eventually be deleted.
3792 Builder.SetInsertPoint(
3793 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3794
3795 // Create a phi node for the new recurrence. The current value will either be
3796 // the initial value inserted into a vector or loop-varying vector value.
3797 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3798 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3799
3800 // Get the vectorized previous value of the last part UF - 1. It appears last
3801 // among all unrolled iterations, due to the order of their construction.
3802 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3803
3804 // Find and set the insertion point after the previous value if it is an
3805 // instruction.
3806 BasicBlock::iterator InsertPt;
3807 // Note that the previous value may have been constant-folded so it is not
3808 // guaranteed to be an instruction in the vector loop.
3809 // FIXME: Loop invariant values do not form recurrences. We should deal with
3810 // them earlier.
3811 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3812 InsertPt = LoopVectorBody->getFirstInsertionPt();
3813 else {
3814 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3815 if (isa<PHINode>(PreviousLastPart))
3816 // If the previous value is a phi node, we should insert after all the phi
3817 // nodes in the block containing the PHI to avoid breaking basic block
3818 // verification. Note that the basic block may be different to
3819 // LoopVectorBody, in case we predicate the loop.
3820 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3821 else
3822 InsertPt = ++PreviousInst->getIterator();
3823 }
3824 Builder.SetInsertPoint(&*InsertPt);
3825
3826 // We will construct a vector for the recurrence by combining the values for
3827 // the current and previous iterations. This is the required shuffle mask.
3828 assert(!VF.isScalable());
3829 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3830 ShuffleMask[0] = VF.getKnownMinValue() - 1;
3831 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3832 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
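// For example, with VF = 4 the mask built above is <3, 4, 5, 6>: the shuffle
// takes lane 3 of its first operand (the recurrence vector carried in from the
// previous iteration) followed by lanes 0-2 of its second operand (the current
// part of the previous value).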
3833
3834 // The vector from which to take the initial value for the current iteration
3835 // (actual or unrolled). Initially, this is the vector phi node.
3836 Value *Incoming = VecPhi;
3837
3838 // Shuffle the current and previous vector and update the vector parts.
3839 for (unsigned Part = 0; Part < UF; ++Part) {
3840 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3841 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3842 auto *Shuffle =
3843 VF.isVector()
3844 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3845 : Incoming;
3846 PhiPart->replaceAllUsesWith(Shuffle);
3847 cast<Instruction>(PhiPart)->eraseFromParent();
3848 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3849 Incoming = PreviousPart;
3850 }
3851
3852 // Fix the latch value of the new recurrence in the vector loop.
3853 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3854
3855 // Extract the last vector element in the middle block. This will be the
3856 // initial value for the recurrence when jumping to the scalar loop.
3857 auto *ExtractForScalar = Incoming;
3858 if (VF.isVector()) {
3859 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3860 ExtractForScalar = Builder.CreateExtractElement(
3861 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3862 "vector.recur.extract");
3863 }
3864 // Extract the second last element in the middle block if the
3865 // Phi is used outside the loop. We need to extract the phi itself
3866 // and not the last element (the phi update in the current iteration). This
3867 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3868 // when the scalar loop is not run at all.
3869 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3870 if (VF.isVector())
3871 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3872 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3873 "vector.recur.extract.for.phi");
3874 // When the loop is unrolled without vectorizing, initialize
3875 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3876 // `Incoming` (i.e. part UF - 2). This is analogous to the vectorized case
3877 // above: extracting the second last element when VF > 1.
3878 else if (UF > 1)
3879 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3880
3881 // Fix the initial value of the original recurrence in the scalar loop.
3882 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3883 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3884 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3885 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3886 Start->addIncoming(Incoming, BB);
3887 }
3888
3889 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3890 Phi->setName("scalar.recur");
3891
3892 // Finally, fix users of the recurrence outside the loop. The users will need
3893 // either the last value of the scalar recurrence or the last value of the
3894 // vector recurrence we extracted in the middle block. Since the loop is in
3895 // LCSSA form, we just need to find all the phi nodes for the original scalar
3896 // recurrence in the exit block, and then add an edge for the middle block.
3897 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3898 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3899 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3900 }
3901 }
3902}
3903
3904void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3905 Constant *Zero = Builder.getInt32(0);
3906
3907 // Get its reduction variable descriptor.
3908 assert(Legal->isReductionVariable(Phi) &&
3909 "Unable to find the reduction variable");
3910 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3911
3912 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3913 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3914 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3915 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3916 RdxDesc.getMinMaxRecurrenceKind();
3917 setDebugLocFromInst(Builder, ReductionStartValue);
3918 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3919
3920 // We need to generate a reduction vector from the incoming scalar.
3921 // To do so, we need to generate the 'identity' vector and override
3922 // one of the elements with the incoming scalar reduction. We need
3923 // to do it in the vector-loop preheader.
3924 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3925
3926 // This is the vector-clone of the value that leaves the loop.
3927 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3928
3929 // Find the reduction identity value: zero for addition, or and xor;
3930 // one for multiplication; -1 for and.
3931 Value *Identity;
3932 Value *VectorStart;
3933 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3934 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3935 // MinMax reductions have the start value as their identity.
3936 if (VF == 1 || IsInLoopReductionPhi) {
3937 VectorStart = Identity = ReductionStartValue;
3938 } else {
3939 VectorStart = Identity =
3940 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3941 }
3942 } else {
3943 // Handle other reduction kinds:
3944 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3945 RK, VecTy->getScalarType());
3946 if (VF == 1 || IsInLoopReductionPhi) {
3947 Identity = Iden;
3948 // This vector is the Identity vector where the first element is the
3949 // incoming scalar reduction.
3950 VectorStart = ReductionStartValue;
3951 } else {
3952 Identity = ConstantVector::getSplat(VF, Iden);
3953
3954 // This vector is the Identity vector where the first element is the
3955 // incoming scalar reduction.
3956 VectorStart =
3957 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3958 }
3959 }
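// As an illustration (integer add reduction, VF = 4, start value S, not an
// in-loop reduction): Identity is <0, 0, 0, 0> and VectorStart is
// <S, 0, 0, 0>, so only lane 0 of the first unroll part carries the incoming
// scalar start value. For a min/max reduction, Identity and VectorStart are
// both a splat of S instead.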
3960
3961 // Wrap flags are in general invalid after vectorization, clear them.
3962 clearReductionWrapFlags(RdxDesc);
3963
3964 // Fix the vector-loop phi.
3965
3966 // Reductions do not have to start at zero. They can start with
3967 // any loop invariant values.
3968 BasicBlock *Latch = OrigLoop->getLoopLatch();
3969 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3970
3971 for (unsigned Part = 0; Part < UF; ++Part) {
3972 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3973 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3974 // Make sure to add the reduction start value only to the
3975 // first unroll part.
3976 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3977 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3978 cast<PHINode>(VecRdxPhi)
3979 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3980 }
3981
3982 // Before each round, move the insertion point right between
3983 // the PHIs and the values we are going to write.
3984 // This allows us to write both PHINodes and the extractelement
3985 // instructions.
3986 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3987
3988 setDebugLocFromInst(Builder, LoopExitInst);
3989
3990 // If tail is folded by masking, the vector value to leave the loop should be
3991 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3992 // instead of the former.
3993 if (Cost->foldTailByMasking()) {
3994 for (unsigned Part = 0; Part < UF; ++Part) {
3995 Value *VecLoopExitInst =
3996 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3997 Value *Sel = nullptr;
3998 for (User *U : VecLoopExitInst->users()) {
3999 if (isa<SelectInst>(U)) {
4000 assert(!Sel && "Reduction exit feeding two selects");
4001 Sel = U;
4002 } else
4003 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4004 }
4005 assert(Sel && "Reduction exit feeds no select");
4006 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4007
4008 // If the target can create a predicated operator for the reduction at no
4009 // extra cost in the loop (for example a predicated vadd), it can be
4010 // cheaper for the select to remain in the loop than be sunk out of it,
4011 // and so use the select value for the phi instead of the old
4012 // LoopExitValue.
4013 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4014 if (PreferPredicatedReductionSelect ||
4015 TTI->preferPredicatedReductionSelect(
4016 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4017 Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4018 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4019 VecRdxPhi->setIncomingValueForBlock(
4020 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4021 }
4022 }
4023 }
4024
4025 // If the vector reduction can be performed in a smaller type, we truncate
4026 // then extend the loop exit value to enable InstCombine to evaluate the
4027 // entire expression in the smaller type.
4028 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4029 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4030 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4031 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4032 Builder.SetInsertPoint(
4033 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4034 VectorParts RdxParts(UF);
4035 for (unsigned Part = 0; Part < UF; ++Part) {
4036 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4037 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4038 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4039 : Builder.CreateZExt(Trunc, VecTy);
4040 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4041 UI != RdxParts[Part]->user_end();)
4042 if (*UI != Trunc) {
4043 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4044 RdxParts[Part] = Extnd;
4045 } else {
4046 ++UI;
4047 }
4048 }
4049 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4050 for (unsigned Part = 0; Part < UF; ++Part) {
4051 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4052 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4053 }
4054 }
4055
4056 // Reduce all of the unrolled parts into a single vector.
4057 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4058 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4059
4060 // The middle block terminator has already been assigned a DebugLoc here (the
4061 // OrigLoop's single latch terminator). We want the whole middle block to
4062 // appear to execute on this line because: (a) it is all compiler generated,
4063 // (b) these instructions are always executed after evaluating the latch
4064 // conditional branch, and (c) other passes may add new predecessors which
4065 // terminate on this line. This is the easiest way to ensure we don't
4066 // accidentally cause an extra step back into the loop while debugging.
4067 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4068 for (unsigned Part = 1; Part < UF; ++Part) {
4069 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4070 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4071 // Floating point operations had to be 'fast' to enable the reduction.
4072 ReducedPartRdx = addFastMathFlag(
4073 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4074 ReducedPartRdx, "bin.rdx"),
4075 RdxDesc.getFastMathFlags());
4076 else
4077 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4078 RdxPart);
4079 }
4080
4081 // Create the reduction after the loop. Note that inloop reductions create the
4082 // target reduction in the loop using a Reduction recipe.
4083 if (VF.isVector() && !IsInLoopReductionPhi) {
4084 bool NoNaN = Legal->hasFunNoNaNAttr();
4085 ReducedPartRdx =
4086 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4087 // If the reduction can be performed in a smaller type, we need to extend
4088 // the reduction to the wider type before we branch to the original loop.
4089 if (Phi->getType() != RdxDesc.getRecurrenceType())
4090 ReducedPartRdx =
4091 RdxDesc.isSigned()
4092 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4093 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4094 }
4095
4096 // Create a phi node that merges control-flow from the backedge-taken check
4097 // block and the middle block.
4098 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4099 LoopScalarPreHeader->getTerminator());
4100 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4101 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4102 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4103
4104 // Now, we need to fix the users of the reduction variable
4105 // inside and outside of the scalar remainder loop.
4106 // We know that the loop is in LCSSA form. We need to update the
4107 // PHI nodes in the exit blocks.
4108 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4109 // All PHINodes need to have a single entry edge, or two if
4110 // we already fixed them.
4111 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4112
4113 // We found a reduction value exit-PHI. Update it with the
4114 // incoming bypass edge.
4115 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4116 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4117 } // end of the LCSSA phi scan.
4118
4119 // Fix the scalar loop reduction variable with the incoming reduction sum
4120 // from the vector body and from the backedge value.
4121 int IncomingEdgeBlockIdx =
4122 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4123 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4124 // Pick the other block.
4125 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4126 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4127 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4128}
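// Rough shape of the result for an i32 add reduction with UF = 2 and VF = 4
// (names and the exact reduction call depend on createTargetReduction):
//   middle.block:
//     %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//     %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// %rdx then feeds the bc.merge.rdx phi in the scalar preheader and the LCSSA
// phis in the exit block.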
4129
4130void InnerLoopVectorizer::clearReductionWrapFlags(
4131 RecurrenceDescriptor &RdxDesc) {
4132 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4133 if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4134 RK != RecurrenceDescriptor::RK_IntegerMult)
4135 return;
4136
4137 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4138 assert(LoopExitInstr && "null loop exit instruction");
4139 SmallVector<Instruction *, 8> Worklist;
4140 SmallPtrSet<Instruction *, 8> Visited;
4141 Worklist.push_back(LoopExitInstr);
4142 Visited.insert(LoopExitInstr);
4143
4144 while (!Worklist.empty()) {
4145 Instruction *Cur = Worklist.pop_back_val();
4146 if (isa<OverflowingBinaryOperator>(Cur))
4147 for (unsigned Part = 0; Part < UF; ++Part) {
4148 Value *V = getOrCreateVectorValue(Cur, Part);
4149 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4150 }
4151
4152 for (User *U : Cur->users()) {
4153 Instruction *UI = cast<Instruction>(U);
4154 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4155 Visited.insert(UI).second)
4156 Worklist.push_back(UI);
4157 }
4158 }
4159}
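// Rationale sketch: nsw/nuw on the scalar reduction chain only hold for the
// original evaluation order. After widening, the per-lane and per-part partial
// results may wrap even when the final reduced value does not, so keeping the
// poison-generating flags on the widened operations would be unsound.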
4160
4161void InnerLoopVectorizer::fixLCSSAPHIs() {
4162 assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4163 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4164 if (LCSSAPhi.getNumIncomingValues() == 1) {
4165 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4166 // Non-instruction incoming values will have only one value.
4167 unsigned LastLane = 0;
4168 if (isa<Instruction>(IncomingValue))
4169 LastLane = Cost->isUniformAfterVectorization(
4170 cast<Instruction>(IncomingValue), VF)
4171 ? 0
4172 : VF.getKnownMinValue() - 1;
4173 // Can be a loop invariant incoming value or the last scalar value to be
4174 // extracted from the vectorized loop.
4175 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4176 Value *lastIncomingValue =
4177 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4178 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4179 }
4180 }
4181}
4182
4183void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4184 // The basic block and loop containing the predicated instruction.
4185 auto *PredBB = PredInst->getParent();
4186 auto *VectorLoop = LI->getLoopFor(PredBB);
4187
4188 // Initialize a worklist with the operands of the predicated instruction.
4189 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4190
4191 // Holds instructions that we need to analyze again. An instruction may be
4192 // reanalyzed if we don't yet know if we can sink it or not.
4193 SmallVector<Instruction *, 8> InstsToReanalyze;
4194
4195 // Returns true if a given use occurs in the predicated block. Phi nodes use
4196 // their operands in their corresponding predecessor blocks.
4197 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4198 auto *I = cast<Instruction>(U.getUser());
4199 BasicBlock *BB = I->getParent();
4200 if (auto *Phi = dyn_cast<PHINode>(I))
4201 BB = Phi->getIncomingBlock(
4202 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4203 return BB == PredBB;
4204 };
4205
4206 // Iteratively sink the scalarized operands of the predicated instruction
4207 // into the block we created for it. When an instruction is sunk, its
4208 // operands are then added to the worklist. The algorithm ends when a pass
4209 // over the worklist does not sink a single instruction.
4210 bool Changed;
4211 do {
4212 // Add the instructions that need to be reanalyzed to the worklist, and
4213 // reset the changed indicator.
4214 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4215 InstsToReanalyze.clear();
4216 Changed = false;
4217
4218 while (!Worklist.empty()) {
4219 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4220
4221 // We can't sink an instruction if it is a phi node, is already in the
4222 // predicated block, is not in the loop, or may have side effects.
4223 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4224 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4225 continue;
4226
4227 // It's legal to sink the instruction if all its uses occur in the
4228 // predicated block. Otherwise, there's nothing to do yet, and we may
4229 // need to reanalyze the instruction.
4230 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4231 InstsToReanalyze.push_back(I);
4232 continue;
4233 }
4234
4235 // Move the instruction to the beginning of the predicated block, and add
4236 // its operands to the worklist.
4237 I->moveBefore(&*PredBB->getFirstInsertionPt());
4238 Worklist.insert(I->op_begin(), I->op_end());
4239
4240 // The sinking may have enabled other instructions to be sunk, so we will
4241 // need to iterate.
4242 Changed = true;
4243 }
4244 } while (Changed);
4245}
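// Sketch of the effect: if a scalarized load has been predicated into its own
// block (e.g. a pred.load.if block), a GEP computing its address whose only
// use is that load can be moved into the same block, so the address is only
// computed when the corresponding mask lane is active. Each sunk instruction
// may in turn allow its own operands to be sunk on a later worklist pass.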
4246
4247void InnerLoopVectorizer::fixNonInductionPHIs() {
4248 for (PHINode *OrigPhi : OrigPHIsToFix) {
4249 PHINode *NewPhi =
4250 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4251 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4252
4253 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4254 predecessors(OrigPhi->getParent()));
4255 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4256 predecessors(NewPhi->getParent()));
4257 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4258 "Scalar and Vector BB should have the same number of predecessors");
4259
4260 // The insertion point in Builder may be invalidated by the time we get
4261 // here. Force the Builder insertion point to something valid so that we do
4262 // not run into issues during insertion point restore in
4263 // getOrCreateVectorValue calls below.
4264 Builder.SetInsertPoint(NewPhi);
4265
4266 // The predecessor order is preserved and we can rely on mapping between
4267 // scalar and vector block predecessors.
4268 for (unsigned i = 0; i < NumIncomingValues; ++i) {
4269 BasicBlock *NewPredBB = VectorBBPredecessors[i];
4270
4271 // When looking up the new scalar/vector values to fix up, use incoming
4272 // values from original phi.
4273 Value *ScIncV =
4274 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4275
4276 // Scalar incoming value may need a broadcast
4277 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4278 NewPhi->addIncoming(NewIncV, NewPredBB);
4279 }
4280 }
4281}
4282
4283void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4284 unsigned UF, ElementCount VF,
4285 bool IsPtrLoopInvariant,
4286 SmallBitVector &IsIndexLoopInvariant,
4287 VPTransformState &State) {
4288 // Construct a vector GEP by widening the operands of the scalar GEP as
4289 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4290 // results in a vector of pointers when at least one operand of the GEP
4291 // is vector-typed. Thus, to keep the representation compact, we only use
4292 // vector-typed operands for loop-varying values.
4293
4294 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4295 // If we are vectorizing, but the GEP has only loop-invariant operands,
4296 // the GEP we build (by only using vector-typed operands for
4297 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4298 // produce a vector of pointers, we need to either arbitrarily pick an
4299 // operand to broadcast, or broadcast a clone of the original GEP.
4300 // Here, we broadcast a clone of the original.
4301 //
4302 // TODO: If at some point we decide to scalarize instructions having
4303 // loop-invariant operands, this special case will no longer be
4304 // required. We would add the scalarization decision to
4305 // collectLoopScalars() and teach getVectorValue() to broadcast
4306 // the lane-zero scalar value.
4307 auto *Clone = Builder.Insert(GEP->clone());
4308 for (unsigned Part = 0; Part < UF; ++Part) {
4309 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4310 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4311 addMetadata(EntryPart, GEP);
4312 }
4313 } else {
4314 // If the GEP has at least one loop-varying operand, we are sure to
4315 // produce a vector of pointers. But if we are only unrolling, we want
4316 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4317 // produce with the code below will be scalar (if VF == 1) or vector
4318 // (otherwise). Note that for the unroll-only case, we still maintain
4319 // values in the vector mapping with initVector, as we do for other
4320 // instructions.
4321 for (unsigned Part = 0; Part < UF; ++Part) {
4322 // The pointer operand of the new GEP. If it's loop-invariant, we
4323 // won't broadcast it.
4324 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4325 : State.get(Operands.getOperand(0), Part);
4326
4327 // Collect all the indices for the new GEP. If any index is
4328 // loop-invariant, we won't broadcast it.
4329 SmallVector<Value *, 4> Indices;
4330 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4331 VPValue *Operand = Operands.getOperand(I);
4332 if (IsIndexLoopInvariant[I - 1])
4333 Indices.push_back(State.get(Operand, {0, 0}));
4334 else
4335 Indices.push_back(State.get(Operand, Part));
4336 }
4337
4338 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4339 // but it should be a vector, otherwise.
4340 auto *NewGEP =
4341 GEP->isInBounds()
4342 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4343 Indices)
4344 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4345 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4346 "NewGEP is not a pointer vector");
4347 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4348 addMetadata(NewGEP, GEP);
4349 }
4350 }
4351}
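// For example, with VF = 4, a loop-invariant base and a loop-varying index,
// the scalar
//   %gep = getelementptr inbounds float, float* %base, i64 %i
// is widened (per unroll part) into roughly
//   %gep.vec = getelementptr inbounds float, float* %base, <4 x i64> %i.vec
// i.e. a scalar base with a vector index, which yields a <4 x float*> result.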
4352
4353void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4354 ElementCount VF) {
4355 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4356 PHINode *P = cast<PHINode>(PN);
4357 if (EnableVPlanNativePath) {
4358 // Currently we enter here in the VPlan-native path for non-induction
4359 // PHIs where all control flow is uniform. We simply widen these PHIs.
4360 // Create a vector phi with no operands - the vector phi operands will be
4361 // set at the end of vector code generation.
4362 Type *VecTy =
4363 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4364 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4365 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4366 OrigPHIsToFix.push_back(P);
4367
4368 return;
4369 }
4370
4371 assert(PN->getParent() == OrigLoop->getHeader() &&
4372 "Non-header phis should have been handled elsewhere");
4373
4374 // In order to support recurrences we need to be able to vectorize Phi nodes.
4375 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4376 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4377 // this value when we vectorize all of the instructions that use the PHI.
4378 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4379 for (unsigned Part = 0; Part < UF; ++Part) {
4380 // This is phase one of vectorizing PHIs.
4381 bool ScalarPHI =
4382 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4383 Type *VecTy =
4384 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4385 Value *EntryPart = PHINode::Create(
4386 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4387 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4388 }
4389 return;
4390 }
4391
4392 setDebugLocFromInst(Builder, P);
4393
4394 // This PHINode must be an induction variable.
4395 // Make sure that we know about it.
4396 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4397
4398 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4399 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4400
4401 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4402 // which can be found from the original scalar operations.
4403 switch (II.getKind()) {
4404 case InductionDescriptor::IK_NoInduction:
4405 llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4405)
;
4406 case InductionDescriptor::IK_IntInduction:
4407 case InductionDescriptor::IK_FpInduction:
4408 llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4408)
;
4409 case InductionDescriptor::IK_PtrInduction: {
4410 // Handle the pointer induction variable case.
4411 assert(P->getType()->isPointerTy() && "Unexpected type.");
4412
4413 if (Cost->isScalarAfterVectorization(P, VF)) {
4414 // This is the normalized GEP that starts counting at zero.
4415 Value *PtrInd =
4416 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4417 // Determine the number of scalars we need to generate for each unroll
4418 // iteration. If the instruction is uniform, we only need to generate the
4419 // first lane. Otherwise, we generate all VF values.
4420 unsigned Lanes =
4421 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4422 for (unsigned Part = 0; Part < UF; ++Part) {
4423 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4424 Constant *Idx = ConstantInt::get(PtrInd->getType(),
4425 Lane + Part * VF.getKnownMinValue());
4426 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4427 Value *SclrGep =
4428 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4429 SclrGep->setName("next.gep");
4430 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4431 }
4432 }
4433 return;
4434 }
4435 assert(isa<SCEVConstant>(II.getStep()) &&
4436 "Induction step not a SCEV constant!");
4437 Type *PhiType = II.getStep()->getType();
4438
4439 // Build a pointer phi
4440 Value *ScalarStartValue = II.getStartValue();
4441 Type *ScStValueType = ScalarStartValue->getType();
4442 PHINode *NewPointerPhi =
4443 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4444 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4445
4446 // A pointer induction, performed by using a gep
4447 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4448 Instruction *InductionLoc = LoopLatch->getTerminator();
4449 const SCEV *ScalarStep = II.getStep();
4450 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4451 Value *ScalarStepValue =
4452 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4453 Value *InductionGEP = GetElementPtrInst::Create(
4454 ScStValueType->getPointerElementType(), NewPointerPhi,
4455 Builder.CreateMul(
4456 ScalarStepValue,
4457 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4458 "ptr.ind", InductionLoc);
4459 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4460
4461 // Create UF many actual address geps that use the pointer
4462 // phi as base and a vectorized version of the step value
4463 // (<step*0, ..., step*N>) as offset.
4464 for (unsigned Part = 0; Part < UF; ++Part) {
4465 SmallVector<Constant *, 8> Indices;
4466 // Create a vector of consecutive numbers from zero to VF.
4467 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4468 Indices.push_back(
4469 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4470 Constant *StartOffset = ConstantVector::get(Indices);
4471
4472 Value *GEP = Builder.CreateGEP(
4473 ScStValueType->getPointerElementType(), NewPointerPhi,
4474 Builder.CreateMul(
4475 StartOffset,
4476 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4477 "vector.gep"));
4478 VectorLoopValueMap.setVectorValue(P, Part, GEP);
4479 }
4480 }
4481 }
4482}
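// Illustrative IR for the non-scalarized pointer-induction case, assuming
// VF = 4, UF = 1 and a unit step (names approximate):
//   vector.body:
//     %pointer.phi = phi float* [ %start, %vector.ph ], [ %ptr.ind, %latch ]
//     %gep = getelementptr float, float* %pointer.phi,
//                          <4 x i64> <i64 0, i64 1, i64 2, i64 3>
//     ...
//     %ptr.ind = getelementptr float, float* %pointer.phi, i64 4
// Each unroll part uses offsets <Part*VF, ..., Part*VF + VF - 1> scaled by the
// step value.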
4483
4484/// A helper function for checking whether an integer division-related
4485/// instruction may divide by zero (in which case it must be predicated if
4486/// executed conditionally in the scalar code).
4487/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4488 /// Non-zero divisors that are not compile-time constants will not be
4489/// converted into multiplication, so we will still end up scalarizing
4490/// the division, but can do so w/o predication.
4491static bool mayDivideByZero(Instruction &I) {
4492 assert((I.getOpcode() == Instruction::UDiv ||
4493 I.getOpcode() == Instruction::SDiv ||
4494 I.getOpcode() == Instruction::URem ||
4495 I.getOpcode() == Instruction::SRem) &&
4496 "Unexpected instruction");
4497 Value *Divisor = I.getOperand(1);
4498 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4499 return !CInt || CInt->isZero();
4500}
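// Example: for `udiv i32 %x, 7` the divisor is a non-zero constant, so the
// helper returns false and the scalarized division needs no predication; for
// `udiv i32 %x, %y` (or an explicit zero divisor) it conservatively returns
// true.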
4501
4502void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4503 VPTransformState &State) {
4504 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4505 switch (I.getOpcode()) {
4506 case Instruction::Call:
4507 case Instruction::Br:
4508 case Instruction::PHI:
4509 case Instruction::GetElementPtr:
4510 case Instruction::Select:
4511 llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4511)
;
4512 case Instruction::UDiv:
4513 case Instruction::SDiv:
4514 case Instruction::SRem:
4515 case Instruction::URem:
4516 case Instruction::Add:
4517 case Instruction::FAdd:
4518 case Instruction::Sub:
4519 case Instruction::FSub:
4520 case Instruction::FNeg:
4521 case Instruction::Mul:
4522 case Instruction::FMul:
4523 case Instruction::FDiv:
4524 case Instruction::FRem:
4525 case Instruction::Shl:
4526 case Instruction::LShr:
4527 case Instruction::AShr:
4528 case Instruction::And:
4529 case Instruction::Or:
4530 case Instruction::Xor: {
4531 // Just widen unops and binops.
4532 setDebugLocFromInst(Builder, &I);
4533
4534 for (unsigned Part = 0; Part < UF; ++Part) {
4535 SmallVector<Value *, 2> Ops;
4536 for (VPValue *VPOp : User.operands())
4537 Ops.push_back(State.get(VPOp, Part));
4538
4539 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4540
4541 if (auto *VecOp = dyn_cast<Instruction>(V))
4542 VecOp->copyIRFlags(&I);
4543
4544 // Use this vector value for all users of the original instruction.
4545 VectorLoopValueMap.setVectorValue(&I, Part, V);
4546 addMetadata(V, &I);
4547 }
4548
4549 break;
4550 }
4551 case Instruction::ICmp:
4552 case Instruction::FCmp: {
4553 // Widen compares. Generate vector compares.
4554 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4555 auto *Cmp = cast<CmpInst>(&I);
4556 setDebugLocFromInst(Builder, Cmp);
4557 for (unsigned Part = 0; Part < UF; ++Part) {
4558 Value *A = State.get(User.getOperand(0), Part);
4559 Value *B = State.get(User.getOperand(1), Part);
4560 Value *C = nullptr;
4561 if (FCmp) {
4562 // Propagate fast math flags.
4563 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4564 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4565 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4566 } else {
4567 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4568 }
4569 VectorLoopValueMap.setVectorValue(&I, Part, C);
4570 addMetadata(C, &I);
4571 }
4572
4573 break;
4574 }
4575
4576 case Instruction::ZExt:
4577 case Instruction::SExt:
4578 case Instruction::FPToUI:
4579 case Instruction::FPToSI:
4580 case Instruction::FPExt:
4581 case Instruction::PtrToInt:
4582 case Instruction::IntToPtr:
4583 case Instruction::SIToFP:
4584 case Instruction::UIToFP:
4585 case Instruction::Trunc:
4586 case Instruction::FPTrunc:
4587 case Instruction::BitCast: {
4588 auto *CI = cast<CastInst>(&I);
4589 setDebugLocFromInst(Builder, CI);
4590
4591 /// Vectorize casts.
4592 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4593 Type *DestTy =
4594 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4595
4596 for (unsigned Part = 0; Part < UF; ++Part) {
4597 Value *A = State.get(User.getOperand(0), Part);
4598 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4599 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4600 addMetadata(Cast, &I);
4601 }
4602 break;
4603 }
4604 default:
4605 // This instruction is not vectorized by simple widening.
4606 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4607 llvm_unreachable("Unhandled instruction!");
4608 } // end of switch.
4609}
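// For instance, a scalar `%s = add nsw i32 %a, %b` with VF = 4 becomes, per
// unroll part, `%v = add nsw <4 x i32> %a.vec, %b.vec`; copyIRFlags carries
// the nsw/nuw/fast-math flags of the original instruction over to the widened
// one.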
4610
4611void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4612 VPTransformState &State) {
4613 assert(!isa<DbgInfoIntrinsic>(I) &&
4614 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4615 setDebugLocFromInst(Builder, &I);
4616
4617 Module *M = I.getParent()->getParent()->getParent();
4618 auto *CI = cast<CallInst>(&I);
4619
4620 SmallVector<Type *, 4> Tys;
4621 for (Value *ArgOperand : CI->arg_operands())
4622 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4623
4624 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4625
4626 // The flag shows whether we use an Intrinsic or a usual Call for the
4627 // vectorized version of the instruction, i.e. whether it is beneficial to
4628 // perform an intrinsic call compared to a lib call.
4629 bool NeedToScalarize = false;
4630 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4631 bool UseVectorIntrinsic =
4632 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4633 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4634 "Instruction should be scalarized elsewhere.");
4635
4636 for (unsigned Part = 0; Part < UF; ++Part) {
4637 SmallVector<Value *, 4> Args;
4638 for (auto &I : enumerate(ArgOperands.operands())) {
4639 // Some intrinsics have a scalar argument - don't replace it with a
4640 // vector.
4641 Value *Arg;
4642 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4643 Arg = State.get(I.value(), Part);
4644 else
4645 Arg = State.get(I.value(), {0, 0});
4646 Args.push_back(Arg);
4647 }
4648
4649 Function *VectorF;
4650 if (UseVectorIntrinsic) {
4651 // Use vector version of the intrinsic.
4652 Type *TysForDecl[] = {CI->getType()};
4653 if (VF.isVector()) {
4654 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4655 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4656 }
4657 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4658 assert(VectorF && "Can't retrieve vector intrinsic.");
4659 } else {
4660 // Use vector version of the function call.
4661 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4662#ifndef NDEBUG
4663 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4664 "Can't create vector function.");
4665#endif
4666 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4667 }
4668 SmallVector<OperandBundleDef, 1> OpBundles;
4669 CI->getOperandBundlesAsDefs(OpBundles);
4670 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4671
4672 if (isa<FPMathOperator>(V))
4673 V->copyFastMathFlags(CI);
4674
4675 VectorLoopValueMap.setVectorValue(&I, Part, V);
4676 addMetadata(V, &I);
4677 }
4678}
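// Sketch: for `%r = call float @llvm.sqrt.f32(float %x)` with VF = 4, if the
// intrinsic cost is no worse than the library-call cost this emits
// `call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x.vec)`; otherwise a
// vectorized library routine found through VFDatabase (e.g. one registered by
// a vector math library) is called with the widened arguments.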
4679
4680void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4681 VPUser &Operands,
4682 bool InvariantCond,
4683 VPTransformState &State) {
4684 setDebugLocFromInst(Builder, &I);
4685
4686 // The condition can be loop invariant but still defined inside the
4687 // loop. This means that we can't just use the original 'cond' value.
4688 // We have to take the 'vectorized' value and pick the first lane.
4689 // Instcombine will make this a no-op.
4690 auto *InvarCond =
4691 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4692
4693 for (unsigned Part = 0; Part < UF; ++Part) {
4694 Value *Cond =
4695 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4696 Value *Op0 = State.get(Operands.getOperand(1), Part);
4697 Value *Op1 = State.get(Operands.getOperand(2), Part);
4698 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4699 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4700 addMetadata(Sel, &I);
4701 }
4702}
4703
4704void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4705 // We should not collect Scalars more than once per VF. Right now, this
4706 // function is called from collectUniformsAndScalars(), which already does
4707 // this check. Collecting Scalars for VF=1 does not make any sense.
4708 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4709 "This function should not be visited twice for the same VF");
4710
4711 SmallSetVector<Instruction *, 8> Worklist;
4712
4713 // These sets are used to seed the analysis with pointers used by memory
4714 // accesses that will remain scalar.
4715 SmallSetVector<Instruction *, 8> ScalarPtrs;
4716 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4717 auto *Latch = TheLoop->getLoopLatch();
4718
4719 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4720 // The pointer operands of loads and stores will be scalar as long as the
4721 // memory access is not a gather or scatter operation. The value operand of a
4722 // store will remain scalar if the store is scalarized.
4723 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4724 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4725 assert(WideningDecision != CM_Unknown &&
4726 "Widening decision should be ready at this moment");
4727 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4728 if (Ptr == Store->getValueOperand())
4729 return WideningDecision == CM_Scalarize;
4730 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4731 "Ptr is neither a value or pointer operand");
4732 return WideningDecision != CM_GatherScatter;
4733 };
4734
4735 // A helper that returns true if the given value is a bitcast or
4736 // getelementptr instruction contained in the loop.
4737 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4738 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4739 isa<GetElementPtrInst>(V)) &&
4740 !TheLoop->isLoopInvariant(V);
4741 };
4742
4743 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4744 if (!isa<PHINode>(Ptr) ||
4745 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4746 return false;
4747 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4748 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4749 return false;
4750 return isScalarUse(MemAccess, Ptr);
4751 };
4752
4753 // A helper that evaluates a memory access's use of a pointer. If the
4754 // pointer is actually the pointer induction of a loop, it is being
4755 // inserted into Worklist. If the use will be a scalar use, and the
4756 // pointer is only used by memory accesses, we place the pointer in
4757 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4758 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4759 if (isScalarPtrInduction(MemAccess, Ptr)) {
4760 Worklist.insert(cast<Instruction>(Ptr));
4761 Instruction *Update = cast<Instruction>(
4762 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4763 Worklist.insert(Update);
4764 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4765 << "\n");
4766 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4767 << "\n");
4768 return;
4769 }
4770 // We only care about bitcast and getelementptr instructions contained in
4771 // the loop.
4772 if (!isLoopVaryingBitCastOrGEP(Ptr))
4773 return;
4774
4775 // If the pointer has already been identified as scalar (e.g., if it was
4776 // also identified as uniform), there's nothing to do.
4777 auto *I = cast<Instruction>(Ptr);
4778 if (Worklist.count(I))
4779 return;
4780
4781 // If the use of the pointer will be a scalar use, and all users of the
4782 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4783 // place the pointer in PossibleNonScalarPtrs.
4784 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4785 return isa<LoadInst>(U) || isa<StoreInst>(U);
4786 }))
4787 ScalarPtrs.insert(I);
4788 else
4789 PossibleNonScalarPtrs.insert(I);
4790 };
4791
4792 // We seed the scalars analysis with two classes of instructions: (1)
4793 // instructions marked uniform-after-vectorization and (2) bitcast,
4794 // getelementptr and (pointer) phi instructions used by memory accesses
4795 // requiring a scalar use.
4796 //
4797 // (1) Add to the worklist all instructions that have been identified as
4798 // uniform-after-vectorization.
4799 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4800
4801 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4802 // memory accesses requiring a scalar use. The pointer operands of loads and
4803 // stores will be scalar as long as the memory access is not a gather or
4804 // scatter operation. The value operand of a store will remain scalar if the
4805 // store is scalarized.
4806 for (auto *BB : TheLoop->blocks())
4807 for (auto &I : *BB) {
4808 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4809 evaluatePtrUse(Load, Load->getPointerOperand());
4810 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4811 evaluatePtrUse(Store, Store->getPointerOperand());
4812 evaluatePtrUse(Store, Store->getValueOperand());
4813 }
4814 }
4815 for (auto *I : ScalarPtrs)
4816 if (!PossibleNonScalarPtrs.count(I)) {
4817 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4818 Worklist.insert(I);
4819 }
4820
4821 // Insert the forced scalars.
4822 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4823 // induction variable when the PHI user is scalarized.
4824 auto ForcedScalar = ForcedScalars.find(VF);
4825 if (ForcedScalar != ForcedScalars.end())
4826 for (auto *I : ForcedScalar->second)
4827 Worklist.insert(I);
4828
4829 // Expand the worklist by looking through any bitcasts and getelementptr
4830 // instructions we've already identified as scalar. This is similar to the
4831 // expansion step in collectLoopUniforms(); however, here we're only
4832 // expanding to include additional bitcasts and getelementptr instructions.
4833 unsigned Idx = 0;
4834 while (Idx != Worklist.size()) {
4835 Instruction *Dst = Worklist[Idx++];
4836 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4837 continue;
4838 auto *Src = cast<Instruction>(Dst->getOperand(0));
4839 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4840 auto *J = cast<Instruction>(U);
4841 return !TheLoop->contains(J) || Worklist.count(J) ||
4842 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4843 isScalarUse(J, Src));
4844 })) {
4845 Worklist.insert(Src);
4846 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4847 }
4848 }
4849
4850 // An induction variable will remain scalar if all users of the induction
4851 // variable and induction variable update remain scalar.
4852 for (auto &Induction : Legal->getInductionVars()) {
4853 auto *Ind = Induction.first;
4854 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4855
4856 // If tail-folding is applied, the primary induction variable will be used
4857 // to feed a vector compare.
4858 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4859 continue;
4860
4861 // Determine if all users of the induction variable are scalar after
4862 // vectorization.
4863 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4864 auto *I = cast<Instruction>(U);
4865 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4866 });
4867 if (!ScalarInd)
4868 continue;
4869
4870 // Determine if all users of the induction variable update instruction are
4871 // scalar after vectorization.
4872 auto ScalarIndUpdate =
4873 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4874 auto *I = cast<Instruction>(U);
4875 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4876 });
4877 if (!ScalarIndUpdate)
4878 continue;
4879
4880 // The induction variable and its update instruction will remain scalar.
4881 Worklist.insert(Ind);
4882 Worklist.insert(IndUpdate);
4883 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4884 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4885 << "\n");
4886 }
4887
4888 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4889}
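The expansion loop above follows a common fixed-point worklist pattern: seed the set with values that must stay scalar, then admit an operand only once every in-loop user of that operand is already known to be scalar (or is a scalar memory use). A minimal standalone sketch of that pattern over a toy def-use graph, assuming nothing about LLVM IR (Node, Users and Operands are illustrative types, not LLVM APIs):

#include <cstdio>
#include <set>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Users;    // in-loop users of this node
  std::vector<Node *> Operands; // nodes this one consumes
};

// Grow Seed to a fixed point: an operand joins the set only once every one
// of its users is already in the set.
std::set<Node *> expandWorklist(std::set<Node *> Seed) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    std::set<Node *> Snapshot = Seed; // Seed may grow while we scan it
    for (Node *N : Snapshot)
      for (Node *Op : N->Operands) {
        if (Seed.count(Op))
          continue;
        bool AllUsersIn = true;
        for (Node *U : Op->Users)
          AllUsersIn = AllUsersIn && Seed.count(U) != 0;
        if (AllUsersIn) {
          Seed.insert(Op);
          Changed = true;
        }
      }
  }
  return Seed;
}

int main() {
  // A <- B <- C: C feeds B, B feeds A, and A is the only seeded value.
  Node A{0, {}, {}}, B{1, {}, {}}, C{2, {}, {}};
  B.Operands = {&C};
  C.Users = {&B};
  A.Operands = {&B};
  B.Users = {&A};
  std::set<Node *> Result = expandWorklist({&A});
  std::printf("collected %zu nodes\n", Result.size()); // prints: collected 3 nodes
  return 0;
}

The real routine additionally restricts expansion to loop-varying bitcasts and getelementptrs and treats users outside the loop as harmless, but the convergence argument is the same.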
4890
4891bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4892 ElementCount VF) {
4893 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4894 if (!blockNeedsPredication(I->getParent()))
4895 return false;
4896 switch(I->getOpcode()) {
4897 default:
4898 break;
4899 case Instruction::Load:
4900 case Instruction::Store: {
4901 if (!Legal->isMaskRequired(I))
4902 return false;
4903 auto *Ptr = getLoadStorePointerOperand(I);
4904 auto *Ty = getMemInstValueType(I);
4905 // We have already decided how to vectorize this instruction, get that
4906 // result.
4907 if (VF.isVector()) {
4908 InstWidening WideningDecision = getWideningDecision(I, VF);
4909 assert(WideningDecision != CM_Unknown &&
4910 "Widening decision should be ready at this moment");
4911 return WideningDecision == CM_Scalarize;
4912 }
4913 const Align Alignment = getLoadStoreAlignment(I);
4914 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4915 isLegalMaskedGather(Ty, Alignment))
4916 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4917 isLegalMaskedScatter(Ty, Alignment));
4918 }
4919 case Instruction::UDiv:
4920 case Instruction::SDiv:
4921 case Instruction::SRem:
4922 case Instruction::URem:
4923 return mayDivideByZero(*I);
4924 }
4925 return false;
4926}
4927
4928bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4929 Instruction *I, ElementCount VF) {
4930 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4931 assert(getWideningDecision(I, VF) == CM_Unknown &&
4932 "Decision should not be set yet.");
4933 auto *Group = getInterleavedAccessGroup(I);
4934 assert(Group && "Must have a group.");
4935
4936 // If the instruction's allocated size doesn't equal its type size, it
4937 // requires padding and will be scalarized.
4938 auto &DL = I->getModule()->getDataLayout();
4939 auto *ScalarTy = getMemInstValueType(I);
4940 if (hasIrregularType(ScalarTy, DL, VF))
4941 return false;
4942
4943 // Check if masking is required.
4944 // A Group may need masking for one of two reasons: it resides in a block that
4945 // needs predication, or it was decided to use masking to deal with gaps.
4946 bool PredicatedAccessRequiresMasking =
4947 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4948 bool AccessWithGapsRequiresMasking =
4949 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4950 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4951 return true;
4952
4953 // If masked interleaving is required, we expect that the user/target had
4954 // enabled it, because otherwise it either wouldn't have been created or
4955 // it should have been invalidated by the CostModel.
4956 assert(useMaskedInterleavedAccesses(TTI) &&
4957 "Masked interleave-groups for predicated accesses are not enabled.");
4958
4959 auto *Ty = getMemInstValueType(I);
4960 const Align Alignment = getLoadStoreAlignment(I);
4961 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4962 : TTI.isLegalMaskedStore(Ty, Alignment);
4963}
4964
4965bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4966 Instruction *I, ElementCount VF) {
4967 // Get and ensure we have a valid memory instruction.
4968 LoadInst *LI = dyn_cast<LoadInst>(I);
4969 StoreInst *SI = dyn_cast<StoreInst>(I);
4970 assert((LI || SI) && "Invalid memory instruction");
4971
4972 auto *Ptr = getLoadStorePointerOperand(I);
4973
4974 // In order to be widened, the pointer should be consecutive, first of all.
4975 if (!Legal->isConsecutivePtr(Ptr))
4976 return false;
4977
4978 // If the instruction is a store located in a predicated block, it will be
4979 // scalarized.
4980 if (isScalarWithPredication(I))
4981 return false;
4982
4983 // If the instruction's allocated size doesn't equal its type size, it
4984 // requires padding and will be scalarized.
4985 auto &DL = I->getModule()->getDataLayout();
4986 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4987 if (hasIrregularType(ScalarTy, DL, VF))
4988 return false;
4989
4990 return true;
4991}
4992
4993void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4994 // We should not collect Uniforms more than once per VF. Right now,
4995 // this function is called from collectUniformsAndScalars(), which
4996 // already does this check. Collecting Uniforms for VF=1 does not make any
4997 // sense.
4998
4999 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5000 "This function should not be visited twice for the same VF");
5001
5002 // Visit the list of Uniforms. If we do not find any uniform value, we will
5003 // not analyze it again; Uniforms.count(VF) will still return 1.
5004 Uniforms[VF].clear();
5005
5006 // We now know that the loop is vectorizable!
5007 // Collect instructions inside the loop that will remain uniform after
5008 // vectorization.
5009
5010 // Global values, params and instructions outside of current loop are out of
5011 // scope.
5012 auto isOutOfScope = [&](Value *V) -> bool {
5013 Instruction *I = dyn_cast<Instruction>(V);
5014 return (!I || !TheLoop->contains(I));
5015 };
5016
5017 SetVector<Instruction *> Worklist;
5018 BasicBlock *Latch = TheLoop->getLoopLatch();
5019
5020 // Instructions that are scalar with predication must not be considered
5021 // uniform after vectorization, because that would create an erroneous
5022 // replicating region where only a single instance out of VF should be formed.
5023 // TODO: optimize such seldom cases if found important, see PR40816.
5024 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5025 if (isScalarWithPredication(I, VF)) {
5026 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5027 << *I << "\n");
5028 return;
5029 }
5030 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5031 Worklist.insert(I);
5032 };
5033
5034 // Start with the conditional branch. If the branch condition is an
5035 // instruction contained in the loop that is only used by the branch, it is
5036 // uniform.
5037 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5038 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5039 addToWorklistIfAllowed(Cmp);
5040
5041 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5042 // are pointers that are treated like consecutive pointers during
5043 // vectorization. The pointer operands of interleaved accesses are an
5044 // example.
5045 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5046
5047 // Holds pointer operands of instructions that are possibly non-uniform.
5048 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5049
5050 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5051 InstWidening WideningDecision = getWideningDecision(I, VF);
5052 assert(WideningDecision != CM_Unknown &&
5053 "Widening decision should be ready at this moment");
5054
5055 return (WideningDecision == CM_Widen ||
5056 WideningDecision == CM_Widen_Reverse ||
5057 WideningDecision == CM_Interleave);
5058 };
5059 // Iterate over the instructions in the loop, and collect all
5060 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5061 // that a consecutive-like pointer operand will be scalarized, we collect it
5062 // in PossibleNonUniformPtrs instead. We use two sets here because a single
5063 // getelementptr instruction can be used by both vectorized and scalarized
5064 // memory instructions. For example, if a loop loads and stores from the same
5065 // location, but the store is conditional, the store will be scalarized, and
5066 // the getelementptr won't remain uniform.
5067 for (auto *BB : TheLoop->blocks())
5068 for (auto &I : *BB) {
5069 // If there's no pointer operand, there's nothing to do.
5070 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5071 if (!Ptr)
5072 continue;
5073
5074 // True if all users of Ptr are memory accesses that have Ptr as their
5075 // pointer operand.
5076 auto UsersAreMemAccesses =
5077 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5078 return getLoadStorePointerOperand(U) == Ptr;
5079 });
5080
5081 // Ensure the memory instruction will not be scalarized or used by
5082 // gather/scatter, making its pointer operand non-uniform. If the pointer
5083 // operand is used by any instruction other than a memory access, we
5084 // conservatively assume the pointer operand may be non-uniform.
5085 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5086 PossibleNonUniformPtrs.insert(Ptr);
5087
5088 // If the memory instruction will be vectorized and its pointer operand
5089 // is consecutive-like or used by an interleaved access, the pointer operand should
5090 // remain uniform.
5091 else
5092 ConsecutiveLikePtrs.insert(Ptr);
5093 }
5094
5095 // Add to the Worklist all consecutive and consecutive-like pointers that
5096 // aren't also identified as possibly non-uniform.
5097 for (auto *V : ConsecutiveLikePtrs)
5098 if (!PossibleNonUniformPtrs.count(V))
5099 addToWorklistIfAllowed(V);
5100
5101 // Expand Worklist in topological order: whenever a new instruction
5102 // is added, its users should already be inside the Worklist. This ensures
5103 // a uniform instruction will only be used by uniform instructions.
5104 unsigned idx = 0;
5105 while (idx != Worklist.size()) {
5106 Instruction *I = Worklist[idx++];
5107
5108 for (auto OV : I->operand_values()) {
5109 // isOutOfScope operands cannot be uniform instructions.
5110 if (isOutOfScope(OV))
5111 continue;
5112 // First order recurrence Phi's should typically be considered
5113 // non-uniform.
5114 auto *OP = dyn_cast<PHINode>(OV);
5115 if (OP && Legal->isFirstOrderRecurrence(OP))
5116 continue;
5117 // If all the users of the operand are uniform, then add the
5118 // operand into the uniform worklist.
5119 auto *OI = cast<Instruction>(OV);
5120 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5121 auto *J = cast<Instruction>(U);
5122 return Worklist.count(J) ||
5123 (OI == getLoadStorePointerOperand(J) &&
5124 isUniformDecision(J, VF));
5125 }))
5126 addToWorklistIfAllowed(OI);
5127 }
5128 }
5129
5130 // Returns true if Ptr is the pointer operand of a memory access instruction
5131 // I, and I is known to not require scalarization.
5132 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5133 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5134 };
5135
5136 // For an instruction to be added into Worklist above, all its users inside
5137 // the loop should also be in Worklist. However, this condition cannot be
5138 // true for phi nodes that form a cyclic dependence. We must process phi
5139 // nodes separately. An induction variable will remain uniform if all users
5140 // of the induction variable and induction variable update remain uniform.
5141 // The code below handles both pointer and non-pointer induction variables.
5142 for (auto &Induction : Legal->getInductionVars()) {
5143 auto *Ind = Induction.first;
5144 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5145
5146 // Determine if all users of the induction variable are uniform after
5147 // vectorization.
5148 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5149 auto *I = cast<Instruction>(U);
5150 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5151 isVectorizedMemAccessUse(I, Ind);
5152 });
5153 if (!UniformInd)
5154 continue;
5155
5156 // Determine if all users of the induction variable update instruction are
5157 // uniform after vectorization.
5158 auto UniformIndUpdate =
5159 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5160 auto *I = cast<Instruction>(U);
5161 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5162 isVectorizedMemAccessUse(I, IndUpdate);
5163 });
5164 if (!UniformIndUpdate)
5165 continue;
5166
5167 // The induction variable and its update instruction will remain uniform.
5168 addToWorklistIfAllowed(Ind);
5169 addToWorklistIfAllowed(IndUpdate);
5170 }
5171
5172 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5173}
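The pointer handling above reduces to a set difference: a pointer becomes a uniform candidate only if every memory access using it will be widened, and one scalarized or gather/scatter use vetoes it even when other uses are widened. A small self-contained sketch of that filtering step (Access and the string pointer names are illustrative toy types, not LLVM IR):

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Access {
  std::string Ptr; // name of the pointer operand
  bool Widened;    // true if this memory access will be vectorized
};

// Keep a pointer only if no access using it is scalarized or gather/scattered.
std::set<std::string> uniformPtrCandidates(const std::vector<Access> &Accesses) {
  std::set<std::string> Candidates, Vetoed;
  for (const Access &A : Accesses)
    (A.Widened ? Candidates : Vetoed).insert(A.Ptr);
  std::set<std::string> Result;
  for (const std::string &P : Candidates)
    if (!Vetoed.count(P))
      Result.insert(P);
  return Result;
}

int main() {
  // %p is used by a widened load and by a scalarized (conditional) store,
  // so it must not be treated as uniform, while %q stays a candidate.
  std::vector<Access> Accesses = {{"%p", true}, {"%p", false}, {"%q", true}};
  for (const std::string &P : uniformPtrCandidates(Accesses))
    std::printf("uniform candidate: %s\n", P.c_str()); // prints only %q
  return 0;
}

The loads-and-conditionally-stores situation described in the comment above is exactly the case in which a single getelementptr is vetoed by one of its uses.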
5174
5175bool LoopVectorizationCostModel::runtimeChecksRequired() {
5176 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5177
5178 if (Legal->getRuntimePointerChecking()->Need) {
5179 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5180 "runtime pointer checks needed. Enable vectorization of this "
5181 "loop with '#pragma clang loop vectorize(enable)' when "
5182 "compiling with -Os/-Oz",
5183 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5184 return true;
5185 }
5186
5187 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5188 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5189 "runtime SCEV checks needed. Enable vectorization of this "
5190 "loop with '#pragma clang loop vectorize(enable)' when "
5191 "compiling with -Os/-Oz",
5192 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5193 return true;
5194 }
5195
5196 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5197 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5198 reportVectorizationFailure("Runtime stride check for small trip count",
5199 "runtime stride == 1 checks needed. Enable vectorization of "
5200 "this loop without such check by compiling with -Os/-Oz",
5201 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5202 return true;
5203 }
5204
5205 return false;
5206}
5207
5208Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5209 unsigned UserIC) {
5210 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5211 // TODO: It may be useful to do this since it's still likely to be dynamically
5212 // uniform if the target can skip.
5213 reportVectorizationFailure(
5214 "Not inserting runtime ptr check for divergent target",
5215 "runtime pointer checks needed. Not enabled for divergent target",
5216 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5217 return None;
5218 }
5219
5220 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5221 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5222 if (TC == 1) {
5223 reportVectorizationFailure("Single iteration (non) loop",
5224 "loop trip count is one, irrelevant for vectorization",
5225 "SingleIterationLoop", ORE, TheLoop);
5226 return None;
5227 }
5228
5229 switch (ScalarEpilogueStatus) {
5230 case CM_ScalarEpilogueAllowed:
5231 return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5232 case CM_ScalarEpilogueNotNeededUsePredicate:
5233 LLVM_DEBUG(
5234 dbgs() << "LV: vector predicate hint/switch found.\n"
5235 << "LV: Not allowing scalar epilogue, creating predicated "
5236 << "vector loop.\n");
5237 break;
5238 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5239 // fallthrough as a special case of OptForSize
5240 case CM_ScalarEpilogueNotAllowedOptSize:
5241 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5242 LLVM_DEBUG(
5243 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5244 else
5245 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5246 << "count.\n");
5247
5248 // Bail if runtime checks are required, which are not good when optimising
5249 // for size.
5250 if (runtimeChecksRequired())
5251 return None;
5252 break;
5253 }
5254
5255 // Now try the tail folding
5256
5257 // Invalidate interleave groups that require an epilogue if we can't mask
5258 // the interleave-group.
5259 if (!useMaskedInterleavedAccesses(TTI)) {
5260 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5261 "No decisions should have been taken at this point");
5262 // Note: There is no need to invalidate any cost modeling decisions here, as
5263 // none were taken so far.
5264 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5265 }
5266
5267 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5268 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5269 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5270 if (TC > 0 && TC % MaxVFtimesIC == 0) {
5271 // Accept MaxVF if we do not have a tail.
5272 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5273 return MaxVF;
5274 }
5275
5276 // If we don't know the precise trip count, or if the trip count that we
5277 // found modulo the vectorization factor is not zero, try to fold the tail
5278 // by masking.
5279 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5280 if (Legal->prepareToFoldTailByMasking()) {
5281 FoldTailByMasking = true;
5282 return MaxVF;
5283 }
5284
5285 // If there was a tail-folding hint/switch, but we can't fold the tail by
5286 // masking, fallback to a vectorization with a scalar epilogue.
5287 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5288 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5289 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5290 return None;
5291 }
5292 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5293 "scalar epilogue instead.\n");
5294 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5295 return MaxVF;
5296 }
5297
5298 if (TC == 0) {
5299 reportVectorizationFailure(
5300 "Unable to calculate the loop count due to complex control flow",
5301 "unable to calculate the loop count due to complex control flow",
5302 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5303 return None;
5304 }
5305
5306 reportVectorizationFailure(
5307 "Cannot optimize for size and vectorize at the same time.",
5308 "cannot optimize for size and vectorize at the same time. "
5309 "Enable vectorization of this loop with '#pragma clang loop "
5310 "vectorize(enable)' when compiling with -Os/-Oz",
5311 "NoTailLoopWithOptForSize", ORE, TheLoop);
5312 return None;
5313}
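The tail-handling choice at the end of computeMaxVF can be read as a four-way decision. The sketch below restates just that control flow in a standalone form, under the simplifying assumption that the interleave count is already fixed; the enum and the CanFoldTail/AllowScalarEpilogue flags are illustrative stand-ins for the cost model's real state, not LLVM APIs:

#include <cstdio>

enum class TailPolicy { NoTailNeeded, FoldByMasking, ScalarEpilogue, DontVectorize };

// TC == 0 means the trip count is unknown at compile time.
TailPolicy chooseTailPolicy(unsigned TC, unsigned MaxVF, unsigned IC,
                            bool CanFoldTail, bool AllowScalarEpilogue) {
  if (TC > 0 && TC % (MaxVF * IC) == 0)
    return TailPolicy::NoTailNeeded;   // trip count divides evenly: no tail
  if (CanFoldTail)
    return TailPolicy::FoldByMasking;  // predicate the tail iterations
  if (AllowScalarEpilogue)
    return TailPolicy::ScalarEpilogue; // fall back to a remainder loop
  return TailPolicy::DontVectorize;    // no acceptable way to handle the tail
}

int main() {
  std::printf("%d\n", static_cast<int>(chooseTailPolicy(128, 8, 2, false, false))); // 0
  std::printf("%d\n", static_cast<int>(chooseTailPolicy(100, 8, 1, true, false)));  // 1
  return 0;
}

The real function also distinguishes the predicate-hint case, where failing to fold the tail either forces a scalar epilogue or abandons vectorization entirely, and reports a failure when the trip count is unknown and no epilogue is allowed.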
5314
5315unsigned
5316LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5317 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5318 unsigned SmallestType, WidestType;
5319 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5320 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5321
5322 // Get the maximum safe dependence distance in bits computed by LAA.
5323 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5324 // the memory accesses that is most restrictive (involved in the smallest
5325 // dependence distance).
5326 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5327
5328 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5329
5330 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5331 // Note that both WidestRegister and WidestType may not be powers of 2.
5332 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5333
5334 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5335 << " / " << WidestType << " bits.\n");
5336 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5337 << WidestRegister << " bits.\n");
5338
5339 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5340 " into one vector!");
5341 if (MaxVectorSize == 0) {
5342 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5343 MaxVectorSize = 1;
5344 return MaxVectorSize;
5345 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5346 isPowerOf2_32(ConstTripCount)) {
5347 // We need to clamp the VF to be the ConstTripCount. There is no point in
5348 // choosing a higher viable VF as done in the loop below.
5349 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5350 << ConstTripCount << "\n");
5351 MaxVectorSize = ConstTripCount;
5352 return MaxVectorSize;
5353 }
5354
5355 unsigned MaxVF = MaxVectorSize;
5356 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5357 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5358 // Collect all viable vectorization factors larger than the default MaxVF
5359 // (i.e. MaxVectorSize).
5360 SmallVector<ElementCount, 8> VFs;
5361 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5362 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5363 VFs.push_back(ElementCount::getFixed(VS));
5364
5365 // For each VF calculate its register usage.
5366 auto RUs = calculateRegisterUsage(VFs);
5367
5368 // Select the largest VF which doesn't require more registers than existing
5369 // ones.
5370 for (int i = RUs.size() - 1; i >= 0; --i) {
5371 bool Selected = true;
5372 for (auto& pair : RUs[i].MaxLocalUsers) {
5373 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5374 if (pair.second > TargetNumRegisters)
5375 Selected = false;
5376 }
5377 if (Selected) {
5378 MaxVF = VFs[i].getKnownMinValue();
5379 break;
5380 }
5381 }
5382 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5383 if (MaxVF < MinVF) {
5384 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5385 << ") with target's minimum: " << MinVF << '\n');
5386 MaxVF = MinVF;
5387 }
5388 }
5389 }
5390 return MaxVF;
5391}
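As a worked example of the clamping above: the feasible VF is the largest power of two such that VF elements of the widest loop type fit in the narrower of the target register width and the dependence-distance bound, and a small power-of-two constant trip count caps it further. A standalone numeric sketch of that arithmetic (illustrative only; it ignores the MaximizeBandwidth and getMinimumVF adjustments):

#include <algorithm>
#include <cstdio>

// Round down to a power of two (behaves like llvm::PowerOf2Floor for X > 0).
unsigned powerOf2Floor(unsigned X) {
  unsigned R = 1;
  while (R * 2 <= X)
    R *= 2;
  return X == 0 ? 0 : R;
}

unsigned feasibleMaxVF(unsigned WidestRegisterBits, unsigned MaxSafeBits,
                       unsigned WidestTypeBits, unsigned ConstTripCount) {
  unsigned Widest = std::min(WidestRegisterBits, MaxSafeBits);
  unsigned MaxVectorSize = powerOf2Floor(Widest / WidestTypeBits);
  if (MaxVectorSize == 0)
    return 1; // the target has no usable vector registers
  // A small power-of-two constant trip count caps the VF directly.
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      (ConstTripCount & (ConstTripCount - 1)) == 0)
    return ConstTripCount;
  return MaxVectorSize;
}

int main() {
  // 256-bit registers, unbounded dependence distance, widest type i32 -> VF 8.
  std::printf("%u\n", feasibleMaxVF(256, ~0u, 32, 0)); // 8
  // Same target, but a constant trip count of 4 clamps the VF.
  std::printf("%u\n", feasibleMaxVF(256, ~0u, 32, 4)); // 4
  return 0;
}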
5392
5393VectorizationFactor
5394LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5395 float Cost = expectedCost(ElementCount::getFixed(1)).first;
5396 const float ScalarCost = Cost;
5397 unsigned Width = 1;
5398 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5399
5400 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5401 if (ForceVectorization && MaxVF > 1) {
5402 // Ignore scalar width, because the user explicitly wants vectorization.
5403 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5404 // evaluation.
5405 Cost = std::numeric_limits<float>::max();
5406 }
5407
5408 for (unsigned i = 2; i <= MaxVF; i *= 2) {
5409 // Notice that the vector loop needs to be executed fewer times, so
5410 // we need to divide the cost of the vector loops by the width of
5411 // the vector elements.
5412 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5413 float VectorCost = C.first / (float)i;
5414 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5415 << " costs: " << (int)VectorCost << ".\n");
5416 if (!C.second && !ForceVectorization) {
5417 LLVM_DEBUG(
5418 dbgs() << "LV: Not considering vector loop of width " << i
5419 << " because it will not generate any vector instructions.\n");
5420 continue;
5421 }
5422 if (VectorCost < Cost) {
5423 Cost = VectorCost;
5424 Width = i;
5425 }
5426 }
5427
5428 if (!EnableCondStoresVectorization && NumPredStores) {
5429 reportVectorizationFailure("There are conditional stores.",
5430 "store that is conditionally executed prevents vectorization",
5431 "ConditionalStore", ORE, TheLoop);
5432 Width = 1;
5433 Cost = ScalarCost;
5434 }
5435
5436 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5437 << "LV: Vectorization seems to be not beneficial, "
5438 << "but was forced by a user.\n");
5439 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5440 VectorizationFactor Factor = {ElementCount::getFixed(Width),
5441 (unsigned)(Width * Cost)};
5442 return Factor;
5443}
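The width selection above is a minimum over per-lane costs: each candidate VF is charged its expected loop cost divided by VF, and the scalar loop wins unless some vector width is strictly cheaper per element. A minimal sketch of that comparison, with made-up costs standing in for expectedCost() (it omits the ForceVectorization and conditional-store special cases):

#include <cstdio>
#include <vector>

// Costs[i] holds the expected loop cost at VF = 2^i (Costs[0] is the scalar cost).
unsigned selectWidth(const std::vector<float> &Costs) {
  unsigned Width = 1;
  float Best = Costs[0]; // per-lane cost of the scalar loop
  unsigned VF = 2;
  for (size_t I = 1; I < Costs.size(); ++I, VF *= 2) {
    float PerLane = Costs[I] / static_cast<float>(VF);
    if (PerLane < Best) { // strictly better cost per element processed
      Best = PerLane;
      Width = VF;
    }
  }
  return Width;
}

int main() {
  // Scalar loop costs 10; VF=2 costs 14 (7 per lane); VF=4 costs 24 (6 per lane).
  std::printf("selected VF: %u\n", selectWidth({10.0f, 14.0f, 24.0f})); // 4
  return 0;
}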
5444
5445std::pair<unsigned, unsigned>
5446LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5447 unsigned MinWidth = -1U;
5448 unsigned MaxWidth = 8;
5449 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5450
5451 // For each block.
5452 for (BasicBlock *BB : TheLoop->blocks()) {
5453 // For each instruction in the loop.
5454 for (Instruction &I : BB->instructionsWithoutDebug()) {
5455 Type *T = I.getType();
5456
5457 // Skip ignored values.
5458 if (ValuesToIgnore.count(&I))
5459 continue;
5460
5461 // Only examine Loads, Stores and PHINodes.
5462 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5463 continue;
5464
5465 // Examine PHI nodes that are reduction variables. Update the type to
5466 // account for the recurrence type.
5467 if (auto *PN = dyn_cast<PHINode>(&I)) {
5468 if (!Legal->isReductionVariable(PN))
5469 continue;
5470 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5471 T = RdxDesc.getRecurrenceType();
5472 }
5473
5474 // Examine the stored values.
5475 if (auto *ST = dyn_cast<StoreInst>(&I))
5476 T = ST->getValueOperand()->getType();
5477
5478 // Ignore loaded pointer types and stored pointer types that are not
5479 // vectorizable.
5480 //
5481 // FIXME: The check here attempts to predict whether a load or store will
5482 // be vectorized. We only know this for certain after a VF has
5483 // been selected. Here, we assume that if an access can be
5484 // vectorized, it will be. We should also look at extending this
5485 // optimization to non-pointer types.
5486 //
5487 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5488 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5489 continue;
5490
5491 MinWidth = std::min(MinWidth,
5492 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5493 MaxWidth = std::max(MaxWidth,
5494 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5495 }
5496 }
5497
5498 return {MinWidth, MaxWidth};
5499}
5500
5501unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5502 unsigned LoopCost) {
5503 // -- The interleave heuristics --
5504 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5505 // There are many micro-architectural considerations that we can't predict
5506 // at this level. For example, frontend pressure (on decode or fetch) due to
5507 // code size, or the number and capabilities of the execution ports.
5508 //
5509 // We use the following heuristics to select the interleave count:
5510 // 1. If the code has reductions, then we interleave to break the cross
5511 // iteration dependency.
5512 // 2. If the loop is really small, then we interleave to reduce the loop
5513 // overhead.
5514 // 3. We don't interleave if we think that we will spill registers to memory
5515 // due to the increased register pressure.
5516
5517 if (!isScalarEpilogueAllowed())
5518 return 1;
5519
5520 // We used the distance for the interleave count.
5521 if (Legal->getMaxSafeDepDistBytes() != -1U)
5522 return 1;
5523
5524 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5525 const bool HasReductions = !Legal->getReductionVars().empty();
5526 // Do not interleave loops with a relatively small known or estimated trip
5527 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5528 // enabled and the code has scalar reductions (HasReductions && VF == 1),
5529 // because with the above conditions interleaving can expose ILP and break
5530 // cross-iteration dependences for reductions.
5531 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5532 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5533 return 1;
5534
5535 RegisterUsage R = calculateRegisterUsage({VF})[0];
5536 // We divide by these constants so assume that we have at least one
5537 // instruction that uses at least one register.
5538 for (auto& pair : R.MaxLocalUsers) {
5539 pair.second = std::max(pair.second, 1U);
5540 }
5541
5542 // We calculate the interleave count using the following formula.
5543 // Subtract the number of loop invariants from the number of available
5544 // registers. These registers are used by all of the interleaved instances.
5545 // Next, divide the remaining registers by the number of registers that is
5546 // required by the loop, in order to estimate how many parallel instances
5547 // fit without causing spills. All of this is rounded down if necessary to be
5548 // a power of two. We want power of two interleave count to simplify any
5549 // addressing operations or alignment considerations.
5550 // We also want power of two interleave counts to ensure that the induction
5551 // variable of the vector loop wraps to zero, when tail is folded by masking;
5552 // this currently happens when OptForSize, in which case IC is set to 1 above.
5553 unsigned IC = UINT_MAX;
5554
5555 for (auto& pair : R.MaxLocalUsers) {
5556 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5557 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5558 << " registers of "
5559 << TTI.getRegisterClassName(pair.first) << " register class\n");
5560 if (VF.isScalar()) {
5561 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5562 TargetNumRegisters = ForceTargetNumScalarRegs;
5563 } else {
5564 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5565 TargetNumRegisters = ForceTargetNumVectorRegs;
5566 }
5567 unsigned MaxLocalUsers = pair.second;
5568 unsigned LoopInvariantRegs = 0;
5569 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5570 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5571
5572 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5573 // Don't count the induction variable as interleaved.
5574 if (EnableIndVarRegisterHeur) {
5575 TmpIC =
5576 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5577 std::max(1U, (MaxLocalUsers - 1)));
5578 }
5579
5580 IC = std::min(IC, TmpIC);
5581 }
5582
5583 // Clamp the interleave ranges to reasonable counts.
5584 assert(!VF.isScalable() && "scalable vectors not yet supported.");
5585 unsigned MaxInterleaveCount =
5586 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5587
5588 // Check if the user has overridden the max.
5589 if (VF.isScalar()) {
5590 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5591 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5592 } else {
5593 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5594 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5595 }
5596
5597 // If trip count is known or estimated compile time constant, limit the
5598 // interleave count to be less than the trip count divided by VF.
5599 if (BestKnownTC) {
5600 MaxInterleaveCount =
5601 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5602 }
5603
5604 // If we did not calculate the cost for VF (because the user selected the VF)
5605 // then we calculate the cost of VF here.
5606 if (LoopCost == 0)
5607 LoopCost = expectedCost(VF).first;
5608
5609 assert(LoopCost && "Non-zero loop cost expected");
5610
5611 // Clamp the calculated IC to be between the 1 and the max interleave count
5612 // that the target and trip count allows.
5613 if (IC > MaxInterleaveCount)
5614 IC = MaxInterleaveCount;
5615 else if (IC < 1)
5616 IC = 1;
5617
5618 // Interleave if we vectorized this loop and there is a reduction that could
5619 // benefit from interleaving.
5620 if (VF.isVector() && HasReductions) {
5621 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5622 return IC;
5623 }
5624
5625 // Note that if we've already vectorized the loop we will have done the
5626 // runtime check and so interleaving won't require further checks.
5627 bool InterleavingRequiresRuntimePointerCheck =
5628 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5629
5630 // We want to interleave small loops in order to reduce the loop overhead and
5631 // potentially expose ILP opportunities.
5632 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5633 << "LV: IC is " << IC << '\n'
5634 << "LV: VF is " << VF.getKnownMinValue() << '\n');
5635 const bool AggressivelyInterleaveReductions =
5636 TTI.enableAggressiveInterleaving(HasReductions);
5637 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5638 // We assume that the cost overhead is 1 and we use the cost model
5639 // to estimate the cost of the loop and interleave until the cost of the
5640 // loop overhead is about 5% of the cost of the loop.
5641 unsigned SmallIC =
5642 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5643
5644 // Interleave until store/load ports (estimated by max interleave count) are
5645 // saturated.
5646 unsigned NumStores = Legal->getNumStores();
5647 unsigned NumLoads = Legal->getNumLoads();
5648 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5649 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5650
5651 // If we have a scalar reduction (vector reductions are already dealt with
5652 // by this point), we can increase the critical path length if the loop
5653 // we're interleaving is inside another loop. Limit it, by default, to 2 so the
5654 // critical path only gets increased by one reduction operation.
5655 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5656 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5657 SmallIC = std::min(SmallIC, F);
5658 StoresIC = std::min(StoresIC, F);
5659 LoadsIC = std::min(LoadsIC, F);
5660 }
5661
5662 if (EnableLoadStoreRuntimeInterleave &&
5663 std::max(StoresIC, LoadsIC) > SmallIC) {
5664 LLVM_DEBUG(
5665 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5666 return std::max(StoresIC, LoadsIC);
5667 }
5668
5669 // If there are scalar reductions and TTI has enabled aggressive
5670 // interleaving for reductions, we will interleave to expose ILP.
5671 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5672 AggressivelyInterleaveReductions) {
5673 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5674 // Interleave no less than SmallIC but not as aggressive as the normal IC
5675 // to satisfy the rare situation when resources are too limited.
5676 return std::max(IC / 2, SmallIC);
5677 } else {
5678 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5679 return SmallIC;
5680 }
5681 }
5682
5683 // Interleave if this is a large loop (small loops are already dealt with by
5684 // this point) that could benefit from interleaving.
5685 if (AggressivelyInterleaveReductions) {
5686 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5687 return IC;
5688 }
5689
5690 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5691 return 1;
5692}
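
[Editorial note: the following is a standalone illustration of the small-loop interleaving heuristic implemented above. All numbers are assumptions chosen for the example; the real values come from the target's cost model.]

// Illustration only, not part of LoopVectorize.cpp.
#include <algorithm>
#include <cstdio>

// Largest power of two <= V (V assumed >= 1), mirroring PowerOf2Floor.
static unsigned powerOf2Floor(unsigned V) {
  unsigned P = 1;
  while (P * 2 <= V)
    P *= 2;
  return P;
}

int main() {
  unsigned IC = 8;             // assumed max interleave count from the target
  unsigned SmallLoopCost = 20; // assumed "small loop" cost threshold
  unsigned LoopCost = 6;       // assumed cost-model estimate of the loop body
  // Interleave until the loop overhead is roughly 5% of the loop cost,
  // clamped to a power of two and to the target's maximum.
  unsigned SmallIC = std::min(IC, powerOf2Floor(SmallLoopCost / LoopCost));
  std::printf("SmallIC = %u\n", SmallIC); // 20 / 6 = 3 -> 2
  return 0;
}
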
5693
5694SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5695LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5696 // This function calculates the register usage by measuring the highest number
5697 // of values that are alive at a single location. Obviously, this is a very
5698 // rough estimation. We scan the loop in a topological order in order to
5699 // assign a number to each instruction. We use RPO to ensure that defs are
5700 // met before their users. We assume that each instruction that has in-loop
5701 // users starts an interval. We record every time that an in-loop value is
5702 // used, so we have a list of the first and last occurrences of each
5703 // instruction. Next, we transpose this data structure into a multi map that
5704 // holds the list of intervals that *end* at a specific location. This multi
5705 // map allows us to perform a linear search. We scan the instructions linearly
5706 // and record each time that a new interval starts, by placing it in a set.
5707 // If we find this value in the multi-map then we remove it from the set.
5708 // The max register usage is the maximum size of the set.
5709 // We also search for instructions that are defined outside the loop, but are
5710 // used inside the loop. We need this number separately from the max-interval
5711 // usage number because when we unroll, loop-invariant values do not take
5712 // more registers.
5713 LoopBlocksDFS DFS(TheLoop);
5714 DFS.perform(LI);
5715
5716 RegisterUsage RU;
5717
5718 // Each 'key' in the map opens a new interval. The values
5719 // of the map are the index of the 'last seen' usage of the
5720 // instruction that is the key.
5721 using IntervalMap = DenseMap<Instruction *, unsigned>;
5722
5723 // Maps instruction to its index.
5724 SmallVector<Instruction *, 64> IdxToInstr;
5725 // Marks the end of each interval.
5726 IntervalMap EndPoint;
5727 // Saves the list of instruction indices that are used in the loop.
5728 SmallPtrSet<Instruction *, 8> Ends;
5729 // Saves the list of values that are used in the loop but are
5730 // defined outside the loop, such as arguments and constants.
5731 SmallPtrSet<Value *, 8> LoopInvariants;
5732
5733 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5734 for (Instruction &I : BB->instructionsWithoutDebug()) {
5735 IdxToInstr.push_back(&I);
5736
5737 // Save the end location of each USE.
5738 for (Value *U : I.operands()) {
5739 auto *Instr = dyn_cast<Instruction>(U);
5740
5741 // Ignore non-instruction values such as arguments, constants, etc.
5742 if (!Instr)
5743 continue;
5744
5745 // If this instruction is outside the loop then record it and continue.
5746 if (!TheLoop->contains(Instr)) {
5747 LoopInvariants.insert(Instr);
5748 continue;
5749 }
5750
5751 // Overwrite previous end points.
5752 EndPoint[Instr] = IdxToInstr.size();
5753 Ends.insert(Instr);
5754 }
5755 }
5756 }
5757
5758 // Saves the list of intervals that end with the index in 'key'.
5759 using InstrList = SmallVector<Instruction *, 2>;
5760 DenseMap<unsigned, InstrList> TransposeEnds;
5761
5762 // Transpose the EndPoints to a list of values that end at each index.
5763 for (auto &Interval : EndPoint)
5764 TransposeEnds[Interval.second].push_back(Interval.first);
5765
5766 SmallPtrSet<Instruction *, 8> OpenIntervals;
5767
5768 // Get the size of the widest register.
5769 unsigned MaxSafeDepDist = -1U;
5770 if (Legal->getMaxSafeDepDistBytes() != -1U)
5771 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5772 unsigned WidestRegister =
5773 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5774 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5775
5776 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5777 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5778
5779 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5780
5781 // A lambda that gets the register usage for the given type and VF.
5782 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5783 if (Ty->isTokenTy())
5784 return 0U;
5785 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5786 assert(!VF.isScalable() && "scalable vectors not yet supported.");
5787 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize /
5788 WidestRegister);
5789 };
5790
5791 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5792 Instruction *I = IdxToInstr[i];
5793
5794 // Remove all of the instructions that end at this location.
5795 InstrList &List = TransposeEnds[i];
5796 for (Instruction *ToRemove : List)
5797 OpenIntervals.erase(ToRemove);
5798
5799 // Ignore instructions that are never used within the loop.
5800 if (!Ends.count(I))
5801 continue;
5802
5803 // Skip ignored values.
5804 if (ValuesToIgnore.count(I))
5805 continue;
5806
5807 // For each VF find the maximum usage of registers.
5808 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5809 // Count the number of live intervals.
5810 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5811
5812 if (VFs[j].isScalar()) {
5813 for (auto Inst : OpenIntervals) {
5814 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5815 if (RegUsage.find(ClassID) == RegUsage.end())
5816 RegUsage[ClassID] = 1;
5817 else
5818 RegUsage[ClassID] += 1;
5819 }
5820 } else {
5821 collectUniformsAndScalars(VFs[j]);
5822 for (auto Inst : OpenIntervals) {
5823 // Skip ignored values for VF > 1.
5824 if (VecValuesToIgnore.count(Inst))
5825 continue;
5826 if (isScalarAfterVectorization(Inst, VFs[j])) {
5827 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5828 if (RegUsage.find(ClassID) == RegUsage.end())
5829 RegUsage[ClassID] = 1;
5830 else
5831 RegUsage[ClassID] += 1;
5832 } else {
5833 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5834 if (RegUsage.find(ClassID) == RegUsage.end())
5835 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5836 else
5837 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5838 }
5839 }
5840 }
5841
5842 for (auto& pair : RegUsage) {
5843 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5844 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5845 else
5846 MaxUsages[j][pair.first] = pair.second;
5847 }
5848 }
5849
5850 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5851 << OpenIntervals.size() << '\n');
5852
5853 // Add the current instruction to the list of open intervals.
5854 OpenIntervals.insert(I);
5855 }
5856
5857 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5858 SmallMapVector<unsigned, unsigned, 4> Invariant;
5859
5860 for (auto Inst : LoopInvariants) {
5861 unsigned Usage =
5862 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5863 unsigned ClassID =
5864 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5865 if (Invariant.find(ClassID) == Invariant.end())
5866 Invariant[ClassID] = Usage;
5867 else
5868 Invariant[ClassID] += Usage;
5869 }
5870
5871 LLVM_DEBUG({
5872 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5873 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5874 << " item\n";
5875 for (const auto &pair : MaxUsages[i]) {
5876 dbgs() << "LV(REG): RegisterClass: "
5877 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5878 << " registers\n";
5879 }
5880 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5881 << " item\n";
5882 for (const auto &pair : Invariant) {
5883 dbgs() << "LV(REG): RegisterClass: "
5884 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5885 << " registers\n";
5886 }
5887 });
5888
5889 RU.LoopInvariantRegs = Invariant;
5890 RU.MaxLocalUsers = MaxUsages[i];
5891 RUs[i] = RU;
5892 }
5893
5894 return RUs;
5895}
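
[Editorial note: the interval scan described in the comment at the top of calculateRegisterUsage can be hard to picture from the code alone. The following self-contained sketch uses hypothetical def/last-use indices and no LLVM types; it only illustrates the idea of counting simultaneously open intervals.]

#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // Hypothetical intervals: a value is defined at .first, last used at .second.
  std::vector<std::pair<unsigned, unsigned>> Intervals = {
      {0, 3}, {1, 2}, {2, 5}, {3, 4}};
  // Transpose into "which definitions have their last use at index i".
  std::map<unsigned, std::vector<unsigned>> EndsAt;
  for (auto &P : Intervals)
    EndsAt[P.second].push_back(P.first);

  std::set<unsigned> Open;
  unsigned MaxOpen = 0;
  for (unsigned Idx = 0; Idx < Intervals.size(); ++Idx) {
    for (unsigned Def : EndsAt[Idx]) // close intervals whose last use is here
      Open.erase(Def);
    Open.insert(Idx);                // the value defined here opens an interval
    MaxOpen = std::max<unsigned>(MaxOpen, Open.size());
  }
  std::printf("max simultaneously live values: %u\n", MaxOpen);
  return 0;
}
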
5896
5897bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5898 // TODO: Cost model for emulated masked load/store is completely
5899 // broken. This hack guides the cost model to use an artificially
5900 // high enough value to practically disable vectorization with such
5901 // operations, except where previously deployed legality hack allowed
5902 // using very low cost values. This is to avoid regressions coming simply
5903 // from moving "masked load/store" check from legality to cost model.
5904 // Masked Load/Gather emulation was previously never allowed.
5905 // Limited number of Masked Store/Scatter emulation was allowed.
5906 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5907 return isa<LoadInst>(I) ||
5908 (isa<StoreInst>(I) &&
5909 NumPredStores > NumberOfStoresToPredicate);
5910}
5911
5912void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5913 // If we aren't vectorizing the loop, or if we've already collected the
5914 // instructions to scalarize, there's nothing to do. Collection may already
5915 // have occurred if we have a user-selected VF and are now computing the
5916 // expected cost for interleaving.
5917 if (VF.isScalar() || VF.isZero() ||
5918 InstsToScalarize.find(VF) != InstsToScalarize.end())
5919 return;
5920
5921 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5922 // not profitable to scalarize any instructions, the presence of VF in the
5923 // map will indicate that we've analyzed it already.
5924 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5925
5926 // Find all the instructions that are scalar with predication in the loop and
5927 // determine if it would be better to not if-convert the blocks they are in.
5928 // If so, we also record the instructions to scalarize.
5929 for (BasicBlock *BB : TheLoop->blocks()) {
5930 if (!blockNeedsPredication(BB))
5931 continue;
5932 for (Instruction &I : *BB)
5933 if (isScalarWithPredication(&I)) {
5934 ScalarCostsTy ScalarCosts;
5935 // Do not apply discount logic if hacked cost is needed
5936 // for emulated masked memrefs.
5937 if (!useEmulatedMaskMemRefHack(&I) &&
5938 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5939 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5940 // Remember that BB will remain after vectorization.
5941 PredicatedBBsAfterVectorization.insert(BB);
5942 }
5943 }
5944}
5945
5946int LoopVectorizationCostModel::computePredInstDiscount(
5947 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5948 ElementCount VF) {
5949 assert(!isUniformAfterVectorization(PredInst, VF) &&
5950 "Instruction marked uniform-after-vectorization will be predicated");
5951
5952 // Initialize the discount to zero, meaning that the scalar version and the
5953 // vector version cost the same.
5954 int Discount = 0;
5955
5956 // Holds instructions to analyze. The instructions we visit are mapped in
5957 // ScalarCosts. Those instructions are the ones that would be scalarized if
5958 // we find that the scalar version costs less.
5959 SmallVector<Instruction *, 8> Worklist;
5960
5961 // Returns true if the given instruction can be scalarized.
5962 auto canBeScalarized = [&](Instruction *I) -> bool {
5963 // We only attempt to scalarize instructions forming a single-use chain
5964 // from the original predicated block that would otherwise be vectorized.
5965 // Although not strictly necessary, we give up on instructions we know will
5966 // already be scalar to avoid traversing chains that are unlikely to be
5967 // beneficial.
5968 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5969 isScalarAfterVectorization(I, VF))
5970 return false;
5971
5972 // If the instruction is scalar with predication, it will be analyzed
5973 // separately. We ignore it within the context of PredInst.
5974 if (isScalarWithPredication(I))
5975 return false;
5976
5977 // If any of the instruction's operands are uniform after vectorization,
5978 // the instruction cannot be scalarized. This prevents, for example, a
5979 // masked load from being scalarized.
5980 //
5981 // We assume we will only emit a value for lane zero of an instruction
5982 // marked uniform after vectorization, rather than VF identical values.
5983 // Thus, if we scalarize an instruction that uses a uniform, we would
5984 // create uses of values corresponding to the lanes we aren't emitting code
5985 // for. This behavior can be changed by allowing getScalarValue to clone
5986 // the lane zero values for uniforms rather than asserting.
5987 for (Use &U : I->operands())
5988 if (auto *J = dyn_cast<Instruction>(U.get()))
5989 if (isUniformAfterVectorization(J, VF))
5990 return false;
5991
5992 // Otherwise, we can scalarize the instruction.
5993 return true;
5994 };
5995
5996 // Compute the expected cost discount from scalarizing the entire expression
5997 // feeding the predicated instruction. We currently only consider expressions
5998 // that are single-use instruction chains.
5999 Worklist.push_back(PredInst);
6000 while (!Worklist.empty()) {
6001 Instruction *I = Worklist.pop_back_val();
6002
6003 // If we've already analyzed the instruction, there's nothing to do.
6004 if (ScalarCosts.find(I) != ScalarCosts.end())
6005 continue;
6006
6007 // Compute the cost of the vector instruction. Note that this cost already
6008 // includes the scalarization overhead of the predicated instruction.
6009 unsigned VectorCost = getInstructionCost(I, VF).first;
6010
6011 // Compute the cost of the scalarized instruction. This cost is the cost of
6012 // the instruction as if it wasn't if-converted and instead remained in the
6013 // predicated block. We will scale this cost by block probability after
6014 // computing the scalarization overhead.
6015 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6016 unsigned ScalarCost =
6017 VF.getKnownMinValue() *
6018 getInstructionCost(I, ElementCount::getFixed(1)).first;
6019
6020 // Compute the scalarization overhead of needed insertelement instructions
6021 // and phi nodes.
6022 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6023 ScalarCost += TTI.getScalarizationOverhead(
6024 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6025 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6026 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6027 ScalarCost +=
6028 VF.getKnownMinValue() *
6029 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6030 }
6031
6032 // Compute the scalarization overhead of needed extractelement
6033 // instructions. For each of the instruction's operands, if the operand can
6034 // be scalarized, add it to the worklist; otherwise, account for the
6035 // overhead.
6036 for (Use &U : I->operands())
6037 if (auto *J = dyn_cast<Instruction>(U.get())) {
6038 assert(VectorType::isValidElementType(J->getType()) &&
6039 "Instruction has non-scalar type");
6040 if (canBeScalarized(J))
6041 Worklist.push_back(J);
6042 else if (needsExtract(J, VF)) {
6043 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6044 ScalarCost += TTI.getScalarizationOverhead(
6045 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6046 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6047 }
6048 }
6049
6050 // Scale the total scalar cost by block probability.
6051 ScalarCost /= getReciprocalPredBlockProb();
6052
6053 // Compute the discount. A non-negative discount means the vector version
6054 // of the instruction costs more, and scalarizing would be beneficial.
6055 Discount += VectorCost - ScalarCost;
6056 ScalarCosts[I] = ScalarCost;
6057 }
6058
6059 return Discount;
6060}
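
[Editorial note: a small numeric sketch of the discount computed above, with made-up costs; the real values come from TTI, and the predicated-block probability is assumed here to be 1/2.]

#include <cstdio>

int main() {
  unsigned VF = 4;                      // assumed vectorization factor
  unsigned VectorCost = 12;             // assumed cost of the vectorized form
  unsigned ScalarPerLane = 2;           // assumed scalar cost for one lane
  unsigned ReciprocalPredBlockProb = 2; // predicated block assumed taken ~50%
  // Scalar cost: one copy per lane, scaled down by the block probability.
  unsigned ScalarCost = (VF * ScalarPerLane) / ReciprocalPredBlockProb; // 4
  // Positive discount => the scalarized form is expected to be cheaper.
  int Discount = (int)VectorCost - (int)ScalarCost; // 8
  std::printf("discount = %d\n", Discount);
  return 0;
}
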
6061
6062LoopVectorizationCostModel::VectorizationCostTy
6063LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6064 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6065 VectorizationCostTy Cost;
6066
6067 // For each block.
6068 for (BasicBlock *BB : TheLoop->blocks()) {
6069 VectorizationCostTy BlockCost;
6070
6071 // For each instruction in the old loop.
6072 for (Instruction &I : BB->instructionsWithoutDebug()) {
6073 // Skip ignored values.
6074 if (ValuesToIgnore.count(&I) ||
6075 (VF.isVector() && VecValuesToIgnore.count(&I)))
6076 continue;
6077
6078 VectorizationCostTy C = getInstructionCost(&I, VF);
6079
6080 // Check if we should override the cost.
6081 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6082 C.first = ForceTargetInstructionCost;
6083
6084 BlockCost.first += C.first;
6085 BlockCost.second |= C.second;
6086 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6087 << " for VF " << VF << " For instruction: " << I
6088 << '\n');
6089 }
6090
6091 // If we are vectorizing a predicated block, it will have been
6092 // if-converted. This means that the block's instructions (aside from
6093 // stores and instructions that may divide by zero) will now be
6094 // unconditionally executed. For the scalar case, we may not always execute
6095 // the predicated block. Thus, scale the block's cost by the probability of
6096 // executing it.
6097 if (VF.isScalar() && blockNeedsPredication(BB))
6098 BlockCost.first /= getReciprocalPredBlockProb();
6099
6100 Cost.first += BlockCost.first;
6101 Cost.second |= BlockCost.second;
6102 }
6103
6104 return Cost;
6105}
6106
6107/// Gets Address Access SCEV after verifying that the access pattern
6108/// is loop invariant except the induction variable dependence.
6109///
6110/// This SCEV can be sent to the Target in order to estimate the address
6111/// calculation cost.
6112static const SCEV *getAddressAccessSCEV(
6113 Value *Ptr,
6114 LoopVectorizationLegality *Legal,
6115 PredicatedScalarEvolution &PSE,
6116 const Loop *TheLoop) {
6117
6118 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6119 if (!Gep)
6120 return nullptr;
6121
6122 // We are looking for a gep with all loop invariant indices except for one
6123 // which should be an induction variable.
6124 auto SE = PSE.getSE();
6125 unsigned NumOperands = Gep->getNumOperands();
6126 for (unsigned i = 1; i < NumOperands; ++i) {
6127 Value *Opd = Gep->getOperand(i);
6128 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6129 !Legal->isInductionVariable(Opd))
6130 return nullptr;
6131 }
6132
6133 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6134 return PSE.getSCEV(Ptr);
6135}
6136
6137static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6138 return Legal->hasStride(I->getOperand(0)) ||
6139 Legal->hasStride(I->getOperand(1));
6140}
6141
6142unsigned
6143LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6144 ElementCount VF) {
6145 assert(VF.isVector() &&
6146 "Scalarization cost of instruction implies vectorization.");
6147 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6148 Type *ValTy = getMemInstValueType(I);
6149 auto SE = PSE.getSE();
6150
6151 unsigned AS = getLoadStoreAddressSpace(I);
6152 Value *Ptr = getLoadStorePointerOperand(I);
6153 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6154
6155 // Figure out whether the access is strided and get the stride value
6156 // if it's known at compile time.
6157 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6158
6159 // Get the cost of the scalar memory instruction and address computation.
6160 unsigned Cost =
6161 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6162
6163 // Don't pass *I here, since it is scalar but will actually be part of a
6164 // vectorized loop where the user of it is a vectorized instruction.
6165 const Align Alignment = getLoadStoreAlignment(I);
6166 Cost += VF.getKnownMinValue() *
6167 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6168 AS, TTI::TCK_RecipThroughput);
6169
6170 // Get the overhead of the extractelement and insertelement instructions
6171 // we might create due to scalarization.
6172 Cost += getScalarizationOverhead(I, VF);
6173
6174 // If we have a predicated store, it may not be executed for each vector
6175 // lane. Scale the cost by the probability of executing the predicated
6176 // block.
6177 if (isPredicatedInst(I)) {
6178 Cost /= getReciprocalPredBlockProb();
6179
6180 if (useEmulatedMaskMemRefHack(I))
6181 // Artificially setting to a high enough value to practically disable
6182 // vectorization with such operations.
6183 Cost = 3000000;
6184 }
6185
6186 return Cost;
6187}
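
[Editorial note: to make the cost assembly above concrete, here is a hedged sketch with made-up per-unit costs; the real values come from the TTI hooks used above. It follows the same order: per-lane address computation, per-lane scalar memory op, scalarization overhead, then scaling by the predicated-block probability.]

#include <cstdio>

int main() {
  unsigned VF = 4;                      // assumed vectorization factor
  unsigned AddrComputationCost = 1;     // assumed address computation cost
  unsigned ScalarMemOpCost = 2;         // assumed scalar load/store cost
  unsigned ScalarizationOverhead = 4;   // assumed insert/extract overhead
  unsigned ReciprocalPredBlockProb = 2; // predicated block assumed taken ~50%

  unsigned Cost = VF * AddrComputationCost; // one address per lane
  Cost += VF * ScalarMemOpCost;             // one scalar memory op per lane
  Cost += ScalarizationOverhead;            // extractelement/insertelement
  bool Predicated = true;                   // assume the access is predicated
  if (Predicated)
    Cost /= ReciprocalPredBlockProb;        // scale by block probability
  // (The useEmulatedMaskMemRefHack path, which pins the cost to a very large
  // constant, is omitted from this sketch.)
  std::printf("scalarized memory-op cost = %u\n", Cost); // (4 + 8 + 4) / 2 = 8
  return 0;
}
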
6188
6189unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6190 ElementCount VF) {
6191 Type *ValTy = getMemInstValueType(I);
6192 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6193 Value *Ptr = getLoadStorePointerOperand(I);
6194 unsigned AS = getLoadStoreAddressSpace(I);
6195 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6196 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6197
6198 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6199 "Stride should be 1 or -1 for consecutive memory access");
6200 const Align Alignment = getLoadStoreAlignment(I);
6201 unsigned Cost = 0;
6202 if (Legal->isMaskRequired(I))
6203 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6204 CostKind);
6205 else
6206 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6207 CostKind, I);
6208
6209 bool Reverse = ConsecutiveStride < 0;
6210 if (Reverse)
6211 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6212 return Cost;
6213}
6214
6215unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6216 ElementCount VF) {
6217 Type *ValTy = getMemInstValueType(I);
6218 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6219 const Align Alignment = getLoadStoreAlignment(I);
6220 unsigned AS = getLoadStoreAddressSpace(I);
6221 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6222 if (isa<LoadInst>(I)) {
6223 return TTI.getAddressComputationCost(ValTy) +
6224 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6225 CostKind) +
6226 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6227 }
6228 StoreInst *SI = cast<StoreInst>(I);
6229
6230 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6231 return TTI.getAddressComputationCost(ValTy) +
6232 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6233 CostKind) +
6234 (isLoopInvariantStoreValue
6235 ? 0
6236 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6237 VF.getKnownMinValue() - 1));
6238}
6239
6240unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6241 ElementCount VF) {
6242 Type *ValTy = getMemInstValueType(I);
6243 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6244 const Align Alignment = getLoadStoreAlignment(I);
6245 const Value *Ptr = getLoadStorePointerOperand(I);
6246
6247 return TTI.getAddressComputationCost(VectorTy) +
6248 TTI.getGatherScatterOpCost(
6249 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6250 TargetTransformInfo::TCK_RecipThroughput, I);
6251}
6252
6253unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6254 ElementCount VF) {
6255 Type *ValTy = getMemInstValueType(I);
6256 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6257 unsigned AS = getLoadStoreAddressSpace(I);
6258
6259 auto Group = getInterleavedAccessGroup(I);
6260 assert(Group && "Fail to get an interleaved access group.");
6261
6262 unsigned InterleaveFactor = Group->getFactor();
6263 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6264 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6265
6266 // Holds the indices of existing members in an interleaved load group.
6267 // An interleaved store group doesn't need this as it doesn't allow gaps.
6268 SmallVector<unsigned, 4> Indices;
6269 if (isa<LoadInst>(I)) {
6270 for (unsigned i = 0; i < InterleaveFactor; i++)
6271 if (Group->getMember(i))
6272 Indices.push_back(i);
6273 }
6274
6275 // Calculate the cost of the whole interleaved group.
6276 bool UseMaskForGaps =
6277 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6278 unsigned Cost = TTI.getInterleavedMemoryOpCost(
6279 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6280 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6281
6282 if (Group->isReverse()) {
6283 // TODO: Add support for reversed masked interleaved access.
6284 assert(!Legal->isMaskRequired(I) &&
6285 "Reverse masked interleaved access not supported.");
6286 Cost += Group->getNumMembers() *
6287 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6288 }
6289 return Cost;
6290}
6291
6292unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6293 ElementCount VF) {
6294 // Calculate scalar cost only. Vectorization cost should be ready at this
6295 // moment.
6296 if (VF.isScalar()) {
6297 Type *ValTy = getMemInstValueType(I);
6298 const Align Alignment = getLoadStoreAlignment(I);
6299 unsigned AS = getLoadStoreAddressSpace(I);
6300
6301 return TTI.getAddressComputationCost(ValTy) +
6302 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6303 TTI::TCK_RecipThroughput, I);
6304 }
6305 return getWideningCost(I, VF);
6306}
6307
6308LoopVectorizationCostModel::VectorizationCostTy
6309LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6310 ElementCount VF) {
6311 assert(!VF.isScalable() &&
6312 "the cost model is not yet implemented for scalable vectorization");
6313 // If we know that this instruction will remain uniform, check the cost of
6314 // the scalar version.
6315 if (isUniformAfterVectorization(I, VF))
6316 VF = ElementCount::getFixed(1);
6317
6318 if (VF.isVector() && isProfitableToScalarize(I, VF))
6319 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6320
6321 // Forced scalars do not have any scalarization overhead.
6322 auto ForcedScalar = ForcedScalars.find(VF);
6323 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6324 auto InstSet = ForcedScalar->second;
6325 if (InstSet.count(I))
6326 return VectorizationCostTy(
6327 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6328 VF.getKnownMinValue()),
6329 false);
6330 }
6331
6332 Type *VectorTy;
6333 unsigned C = getInstructionCost(I, VF, VectorTy);
6334
6335 bool TypeNotScalarized =
6336 VF.isVector() && VectorTy->isVectorTy() &&
6337 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6338 return VectorizationCostTy(C, TypeNotScalarized);
6339}
6340
6341unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6342 ElementCount VF) {
6343
6344 assert(!VF.isScalable() &&
6345 "cannot compute scalarization overhead for scalable vectorization");
6346 if (VF.isScalar())
6347 return 0;
6348
6349 unsigned Cost = 0;
6350 Type *RetTy = ToVectorTy(I->getType(), VF);
6351 if (!RetTy->isVoidTy() &&
6352 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6353 Cost += TTI.getScalarizationOverhead(
6354 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6355 true, false);
6356
6357 // Some targets keep addresses scalar.
6358 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6359 return Cost;
6360
6361 // Some targets support efficient element stores.
6362 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6363 return Cost;
6364
6365 // Collect operands to consider.
6366 CallInst *CI = dyn_cast<CallInst>(I);
6367 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6368
6369 // Skip operands that do not require extraction/scalarization and do not incur
6370 // any overhead.
6371 return Cost + TTI.getOperandsScalarizationOverhead(
6372 filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6373}
6374
6375void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6376 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6377 if (VF.isScalar())
6378 return;
6379 NumPredStores = 0;
6380 for (BasicBlock *BB : TheLoop->blocks()) {
6381 // For each instruction in the old loop.
6382 for (Instruction &I : *BB) {
6383 Value *Ptr = getLoadStorePointerOperand(&I);
6384 if (!Ptr)
6385 continue;
6386
6387 // TODO: We should generate better code and update the cost model for
6388 // predicated uniform stores. Today they are treated as any other
6389 // predicated store (see added test cases in
6390 // invariant-store-vectorization.ll).
6391 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6392 NumPredStores++;
6393
6394 if (Legal->isUniform(Ptr) &&
6395 // Conditional loads and stores should be scalarized and predicated.
6396 // isScalarWithPredication cannot be used here since masked
6397 // gather/scatters are not considered scalar with predication.
6398 !Legal->blockNeedsPredication(I.getParent())) {
6399 // TODO: Avoid replicating loads and stores instead of
6400 // relying on instcombine to remove them.
6401 // Load: Scalar load + broadcast
6402 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6403 unsigned Cost = getUniformMemOpCost(&I, VF);
6404 setWideningDecision(&I, VF, CM_Scalarize, Cost);
6405 continue;
6406 }
6407
6408 // We assume that widening is the best solution when possible.
6409 if (memoryInstructionCanBeWidened(&I, VF)) {
6410 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6411 int ConsecutiveStride =
6412 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6413 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6414 "Expected consecutive stride.");
6415 InstWidening Decision =
6416 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6417 setWideningDecision(&I, VF, Decision, Cost);
6418 continue;
6419 }
6420
6421 // Choose between Interleaving, Gather/Scatter or Scalarization.
6422 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6423 unsigned NumAccesses = 1;
6424 if (isAccessInterleaved(&I)) {
6425 auto Group = getInterleavedAccessGroup(&I);
6426 assert(Group && "Fail to get an interleaved access group.");
6427
6428 // Make one decision for the whole group.
6429 if (getWideningDecision(&I, VF) != CM_Unknown)
6430 continue;
6431
6432 NumAccesses = Group->getNumMembers();
6433 if (interleavedAccessCanBeWidened(&I, VF))
6434 InterleaveCost = getInterleaveGroupCost(&I, VF);
6435 }
6436
6437 unsigned GatherScatterCost =
6438 isLegalGatherOrScatter(&I)
6439 ? getGatherScatterCost(&I, VF) * NumAccesses
6440 : std::numeric_limits<unsigned>::max();
6441
6442 unsigned ScalarizationCost =
6443 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6444
6445 // Choose better solution for the current VF,
6446 // write down this decision and use it during vectorization.
6447 unsigned Cost;
6448 InstWidening Decision;
6449 if (InterleaveCost <= GatherScatterCost &&
6450 InterleaveCost < ScalarizationCost) {
6451 Decision = CM_Interleave;
6452 Cost = InterleaveCost;
6453 } else if (GatherScatterCost < ScalarizationCost) {
6454 Decision = CM_GatherScatter;
6455 Cost = GatherScatterCost;
6456 } else {
6457 Decision = CM_Scalarize;
6458 Cost = ScalarizationCost;
6459 }
6460 // If the instruction belongs to an interleave group, the whole group
6461 // receives the same decision. The whole group receives the cost, but
6462 // the cost will actually be assigned to one instruction.
6463 if (auto Group = getInterleavedAccessGroup(&I))
6464 setWideningDecision(Group, VF, Decision, Cost);
6465 else
6466 setWideningDecision(&I, VF, Decision, Cost);
6467 }
6468 }
6469
6470 // Make sure that any load of address and any other address computation
6471 // remains scalar unless there is gather/scatter support. This avoids
6472 // inevitable extracts into address registers, and also has the benefit of
6473 // activating LSR more, since that pass can't optimize vectorized
6474 // addresses.
6475 if (TTI.prefersVectorizedAddressing())
6476 return;
6477
6478 // Start with all scalar pointer uses.
6479 SmallPtrSet<Instruction *, 8> AddrDefs;
6480 for (BasicBlock *BB : TheLoop->blocks())
6481 for (Instruction &I : *BB) {
6482 Instruction *PtrDef =
6483 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6484 if (PtrDef && TheLoop->contains(PtrDef) &&
6485 getWideningDecision(&I, VF) != CM_GatherScatter)
6486 AddrDefs.insert(PtrDef);
6487 }
6488
6489 // Add all instructions used to generate the addresses.
6490 SmallVector<Instruction *, 4> Worklist;
6491 for (auto *I : AddrDefs)
6492 Worklist.push_back(I);
6493 while (!Worklist.empty()) {
6494 Instruction *I = Worklist.pop_back_val();
6495 for (auto &Op : I->operands())
6496 if (auto *InstOp = dyn_cast<Instruction>(Op))
6497 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6498 AddrDefs.insert(InstOp).second)
6499 Worklist.push_back(InstOp);
6500 }
6501
6502 for (auto *I : AddrDefs) {
6503 if (isa<LoadInst>(I)) {
6504 // Setting the desired widening decision should ideally be handled
6505 // by cost functions, but since this involves the task of finding out
6506 // if the loaded register is involved in an address computation, it is
6507 // instead changed here when we know this is the case.
6508 InstWidening Decision = getWideningDecision(I, VF);
6509 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6510 // Scalarize a widened load of address.
6511 setWideningDecision(
6512 I, VF, CM_Scalarize,
6513 (VF.getKnownMinValue() *
6514 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6515 else if (auto Group = getInterleavedAccessGroup(I)) {
6516 // Scalarize an interleave group of address loads.
6517 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6518 if (Instruction *Member = Group->getMember(I))
6519 setWideningDecision(
6520 Member, VF, CM_Scalarize,
6521 (VF.getKnownMinValue() *
6522 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6523 }
6524 }
6525 } else
6526 // Make sure I gets scalarized and a cost estimate without
6527 // scalarization overhead.
6528 ForcedScalars[VF].insert(I);
6529 }
6530}
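
[Editorial note: the three-way choice made in the loop above (interleave vs. gather/scatter vs. scalarize) reduces to picking the cheapest option, preferring interleaving on a tie with gather/scatter. A standalone sketch with hypothetical costs:]

#include <cstdio>

int main() {
  unsigned InterleaveCost = 8;     // assumed
  unsigned GatherScatterCost = 8;  // assumed
  unsigned ScalarizationCost = 12; // assumed
  const char *Decision;
  unsigned Cost;
  if (InterleaveCost <= GatherScatterCost &&
      InterleaveCost < ScalarizationCost) {
    Decision = "Interleave";
    Cost = InterleaveCost;
  } else if (GatherScatterCost < ScalarizationCost) {
    Decision = "GatherScatter";
    Cost = GatherScatterCost;
  } else {
    Decision = "Scalarize";
    Cost = ScalarizationCost;
  }
  std::printf("decision = %s, cost = %u\n", Decision, Cost);
  return 0;
}
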
6531
6532unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6533 ElementCount VF,
6534 Type *&VectorTy) {
6535 Type *RetTy = I->getType();
6536 if (canTruncateToMinimalBitwidth(I, VF))
6537 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6538 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6539 auto SE = PSE.getSE();
6540 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6541
6542 // TODO: We need to estimate the cost of intrinsic calls.
6543 switch (I->getOpcode()) {
6544 case Instruction::GetElementPtr:
6545 // We mark this instruction as zero-cost because the cost of GEPs in
6546 // vectorized code depends on whether the corresponding memory instruction
6547 // is scalarized or not. Therefore, we handle GEPs with the memory
6548 // instruction cost.
6549 return 0;
6550 case Instruction::Br: {
6551 // In cases of scalarized and predicated instructions, there will be VF
6552 // predicated blocks in the vectorized loop. Each branch around these
6553 // blocks also requires an extract of its vector compare i1 element.
6554 bool ScalarPredicatedBB = false;
6555 BranchInst *BI = cast<BranchInst>(I);
6556 if (VF.isVector() && BI->isConditional() &&
6557 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6558 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6559 ScalarPredicatedBB = true;
6560
6561 if (ScalarPredicatedBB) {
6562 // Return cost for branches around scalarized and predicated blocks.
6563 assert(!VF.isScalable() && "scalable vectors not yet supported.");
6564 auto *Vec_i1Ty =
6565 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6566 return (TTI.getScalarizationOverhead(
6567 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6568 false, true) +
6569 (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6570 VF.getKnownMinValue()));
6571 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6572 // The back-edge branch will remain, as will all scalar branches.
6573 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6574 else
6575 // This branch will be eliminated by if-conversion.
6576 return 0;
6577 // Note: We currently assume zero cost for an unconditional branch inside
6578 // a predicated block since it will become a fall-through, although we
6579 // may decide in the future to call TTI for all branches.
6580 }
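// [Editor's note: a hedged worked example, not part of the original source;
//  the per-query TTI costs below are made-up round numbers, not values taken
//  from a real target.]
// For the ScalarPredicatedBB branch above with VF = 4, the charged cost is
//   getScalarizationOverhead(<4 x i1>, all lanes, /*Insert=*/false,
//                            /*Extract=*/true)          // say 4 * 1 = 4
//   + 4 * getCFInstrCost(Instruction::Br)               // say 4 * 1 = 4
// i.e. roughly 8, whereas a branch removed by if-conversion costs 0 and the
// loop back-edge branch costs a single getCFInstrCost query.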
6581 case Instruction::PHI: {
6582 auto *Phi = cast<PHINode>(I);
6583
6584 // First-order recurrences are replaced by vector shuffles inside the loop.
6585 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6586 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6587 return TTI.getShuffleCost(
6588 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6589 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6590
6591 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6592 // converted into select instructions. We require N - 1 selects per phi
6593 // node, where N is the number of incoming values.
6594 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6595 return (Phi->getNumIncomingValues() - 1) *
6596 TTI.getCmpSelInstrCost(
6597 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6598 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6599 CostKind);
6600
6601 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6602 }
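// [Editor's note: a hedged example, not part of the original source.]
// For the select-lowering rule above: a phi in a non-header block with three
// incoming values is lowered to N - 1 = 2 vector selects, so its cost is
//   2 * getCmpSelInstrCost(Instruction::Select, <VF x ty>, <VF x i1>).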
6603 case Instruction::UDiv:
6604 case Instruction::SDiv:
6605 case Instruction::URem:
6606 case Instruction::SRem:
6607 // If we have a predicated instruction, it may not be executed for each
6608 // vector lane. Get the scalarization cost and scale this amount by the
6609 // probability of executing the predicated block. If the instruction is not
6610 // predicated, we fall through to the next case.
6611 if (VF.isVector() && isScalarWithPredication(I)) {
6612 unsigned Cost = 0;
6613
6614 // These instructions have a non-void type, so account for the phi nodes
6615 // that we will create. This cost is likely to be zero. The phi node
6616 // cost, if any, should be scaled by the block probability because it
6617 // models a copy at the end of each predicated block.
6618 Cost += VF.getKnownMinValue() *
6619 TTI.getCFInstrCost(Instruction::PHI, CostKind);
6620
6621 // The cost of the non-predicated instruction.
6622 Cost += VF.getKnownMinValue() *
6623 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6624
6625 // The cost of insertelement and extractelement instructions needed for
6626 // scalarization.
6627 Cost += getScalarizationOverhead(I, VF);
6628
6629 // Scale the cost by the probability of executing the predicated blocks.
6630 // This assumes the predicated block for each vector lane is equally
6631 // likely.
6632 return Cost / getReciprocalPredBlockProb();
6633 }
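// [Editor's note: a hedged arithmetic sketch, not part of the original
//  source; the individual TTI costs are hypothetical placeholders.]
// Example for the predicated-division path above with VF = 4: assume the
// scalar sdiv costs 1, the PHI costs 0, and getScalarizationOverhead(I, VF)
// returns 8 for the insertelement/extractelement traffic. Then
//   Cost = 4*0 + 4*1 + 8 = 12,
// and dividing by getReciprocalPredBlockProb() (which models a predicated
// block executing roughly half the time) gives a final cost of about 6.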
6634 LLVM_FALLTHROUGH;
6635 case Instruction::Add:
6636 case Instruction::FAdd:
6637 case Instruction::Sub:
6638 case Instruction::FSub:
6639 case Instruction::Mul:
6640 case Instruction::FMul:
6641 case Instruction::FDiv:
6642 case Instruction::FRem:
6643 case Instruction::Shl:
6644 case Instruction::LShr:
6645 case Instruction::AShr:
6646 case Instruction::And:
6647 case Instruction::Or:
6648 case Instruction::Xor: {
6649 // Since we will replace the stride by 1 the multiplication should go away.
6650 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6651 return 0;
6652 // Certain instructions can be cheaper to vectorize if they have a constant
6653 // second vector operand. One example of this are shifts on x86.
6654 Value *Op2 = I->getOperand(1);
6655 TargetTransformInfo::OperandValueProperties Op2VP;
6656 TargetTransformInfo::OperandValueKind Op2VK =
6657 TTI.getOperandInfo(Op2, Op2VP);
6658 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6659 Op2VK = TargetTransformInfo::OK_UniformValue;
6660
6661 SmallVector<const Value *, 4> Operands(I->operand_values());
6662 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6663 return N * TTI.getArithmeticInstrCost(
6664 I->getOpcode(), VectorTy, CostKind,
6665 TargetTransformInfo::OK_AnyValue,
6666 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6667 }
6668 case Instruction::FNeg: {
6669 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6670 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6671 return N * TTI.getArithmeticInstrCost(
6672 I->getOpcode(), VectorTy, CostKind,
6673 TargetTransformInfo::OK_AnyValue,
6674 TargetTransformInfo::OK_AnyValue,
6675 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6676 I->getOperand(0), I);
6677 }
6678 case Instruction::Select: {
6679 SelectInst *SI = cast<SelectInst>(I);
6680 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6681 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6682 Type *CondTy = SI->getCondition()->getType();
6683 if (!ScalarCond) {
6684 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6685 CondTy = VectorType::get(CondTy, VF);
6686 }
6687 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6688 CostKind, I);
6689 }
6690 case Instruction::ICmp:
6691 case Instruction::FCmp: {
6692 Type *ValTy = I->getOperand(0)->getType();
6693 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6694 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6695 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6696 VectorTy = ToVectorTy(ValTy, VF);
6697 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6698 I);
6699 }
6700 case Instruction::Store:
6701 case Instruction::Load: {
6702 ElementCount Width = VF;
6703 if (Width.isVector()) {
6704 InstWidening Decision = getWideningDecision(I, Width);
6705 assert(Decision != CM_Unknown &&
6706 "CM decision should be taken at this point");
6707 if (Decision == CM_Scalarize)
6708 Width = ElementCount::getFixed(1);
6709 }
6710 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6711 return getMemoryInstructionCost(I, VF);
6712 }
6713 case Instruction::ZExt:
6714 case Instruction::SExt:
6715 case Instruction::FPToUI:
6716 case Instruction::FPToSI:
6717 case Instruction::FPExt:
6718 case Instruction::PtrToInt:
6719 case Instruction::IntToPtr:
6720 case Instruction::SIToFP:
6721 case Instruction::UIToFP:
6722 case Instruction::Trunc:
6723 case Instruction::FPTrunc:
6724 case Instruction::BitCast: {
6725 // Computes the CastContextHint from a Load/Store instruction.
6726 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6727 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6728 "Expected a load or a store!");
6729
6730 if (VF.isScalar() || !TheLoop->contains(I))
6731 return TTI::CastContextHint::Normal;
6732
6733 switch (getWideningDecision(I, VF)) {
6734 case LoopVectorizationCostModel::CM_GatherScatter:
6735 return TTI::CastContextHint::GatherScatter;
6736 case LoopVectorizationCostModel::CM_Interleave:
6737 return TTI::CastContextHint::Interleave;
6738 case LoopVectorizationCostModel::CM_Scalarize:
6739 case LoopVectorizationCostModel::CM_Widen:
6740 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6741 : TTI::CastContextHint::Normal;
6742 case LoopVectorizationCostModel::CM_Widen_Reverse:
6743 return TTI::CastContextHint::Reversed;
6744 case LoopVectorizationCostModel::CM_Unknown:
6745 llvm_unreachable("Instr did not go through cost modelling?");
6746 }
6747
6748 llvm_unreachable("Unhandled case!");
6749 };
6750
6751 unsigned Opcode = I->getOpcode();
6752 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6753 // For Trunc, the context is the only user, which must be a StoreInst.
6754 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6755 if (I->hasOneUse())
6756 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6757 CCH = ComputeCCH(Store);
6758 }
6759 // For Z/Sext, the context is the operand, which must be a LoadInst.
6760 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6761 Opcode == Instruction::FPExt) {
6762 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6763 CCH = ComputeCCH(Load);
6764 }
6765
6766 // We optimize the truncation of induction variables having constant
6767 // integer steps. The cost of these truncations is the same as the scalar
6768 // operation.
6769 if (isOptimizableIVTruncate(I, VF)) {
6770 auto *Trunc = cast<TruncInst>(I);
6771 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6772 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6773 }
6774
6775 Type *SrcScalarTy = I->getOperand(0)->getType();
6776 Type *SrcVecTy =
6777 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6778 if (canTruncateToMinimalBitwidth(I, VF)) {
6779 // This cast is going to be shrunk. This may remove the cast or it might
6780 // turn it into slightly different cast. For example, if MinBW == 16,
6781 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6782 //
6783 // Calculate the modified src and dest types.
6784 Type *MinVecTy = VectorTy;
6785 if (Opcode == Instruction::Trunc) {
6786 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6787 VectorTy =
6788 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6789 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6790 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6791 VectorTy =
6792 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6793 }
6794 }
6795
6796 assert(!VF.isScalable() && "VF is assumed to be non scalable");
6797 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6798 return N *
6799 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6800 }
6801 case Instruction::Call: {
6802 bool NeedToScalarize;
6803 CallInst *CI = cast<CallInst>(I);
6804 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6805 if (getVectorIntrinsicIDForCall(CI, TLI))
6806 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6807 return CallCost;
6808 }
6809 default:
6810 // The cost of executing VF copies of the scalar instruction. This opcode
6811 // is unknown. Assume that it is the same as 'mul'.
6812 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6813 Instruction::Mul, VectorTy, CostKind) +
6814 getScalarizationOverhead(I, VF);
6815 } // end of switch.
6816}
6817
6818char LoopVectorize::ID = 0;
6819
6820static const char lv_name[] = "Loop Vectorization";
6821
6822INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6823INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6824INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6825INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6826INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6827INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6828INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6829INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6830INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6831INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6832INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6833INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6834INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6835INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6836INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6837INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6838
6839namespace llvm {
6840
6841Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6842
6843Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6844 bool VectorizeOnlyWhenForced) {
6845 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6846}
6847
6848} // end namespace llvm
6849
6850bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6851 // Check if the pointer operand of a load or store instruction is
6852 // consecutive.
6853 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6854 return Legal->isConsecutivePtr(Ptr);
6855 return false;
6856}
6857
6858void LoopVectorizationCostModel::collectValuesToIgnore() {
6859 // Ignore ephemeral values.
6860 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6861
6862 // Ignore type-promoting instructions we identified during reduction
6863 // detection.
6864 for (auto &Reduction : Legal->getReductionVars()) {
6865 RecurrenceDescriptor &RedDes = Reduction.second;
6866 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6867 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6868 }
6869 // Ignore type-casting instructions we identified during induction
6870 // detection.
6871 for (auto &Induction : Legal->getInductionVars()) {
6872 InductionDescriptor &IndDes = Induction.second;
6873 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6874 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6875 }
6876}
6877
6878void LoopVectorizationCostModel::collectInLoopReductions() {
6879 // For the moment, without predicated reduction instructions, we do not
6880 // support inloop reductions whilst folding the tail, and hence in those cases
6881 // all reductions are currently out of the loop.
6882 if (foldTailByMasking())
6883 return;
6884
6885 for (auto &Reduction : Legal->getReductionVars()) {
6886 PHINode *Phi = Reduction.first;
6887 RecurrenceDescriptor &RdxDesc = Reduction.second;
6888
6889 // We don't collect reductions that are type promoted (yet).
6890 if (RdxDesc.getRecurrenceType() != Phi->getType())
6891 continue;
6892
6893 // If the target would prefer this reduction to happen "in-loop", then we
6894 // want to record it as such.
6895 unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind());
6896 if (!PreferInLoopReductions &&
6897 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6898 TargetTransformInfo::ReductionFlags()))
6899 continue;
6900
6901 // Check that we can correctly put the reductions into the loop, by
6902 // finding the chain of operations that leads from the phi to the loop
6903 // exit value.
6904 SmallVector<Instruction *, 4> ReductionOperations =
6905 RdxDesc.getReductionOpChain(Phi, TheLoop);
6906 bool InLoop = !ReductionOperations.empty();
6907 if (InLoop)
6908 InLoopReductionChains[Phi] = ReductionOperations;
6909 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6910 << " reduction for phi: " << *Phi << "\n");
6911 }
6912}
6913
6914// TODO: we could return a pair of values that specify the max VF and
6915// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6916// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6917// doesn't have a cost model that can choose which plan to execute if
6918// more than one is generated.
6919static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6920 LoopVectorizationCostModel &CM) {
6921 unsigned WidestType;
6922 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6923 return WidestVectorRegBits / WidestType;
6924}
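// [Editor's note: a hedged example, not part of the original source; the
//  register width and element type are hypothetical.]
// determineVPlanVF divides the widest vector register width by the widest
// scalar type in the loop, e.g. for 256-bit registers and an i32-wide body:
//   unsigned VF = determineVPlanVF(/*WidestVectorRegBits=*/256, CM); // 256/32 == 8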
6925
6926VectorizationFactor
6927LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6928 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6929 ElementCount VF = UserVF;
6930 // Outer loop handling: They may require CFG and instruction level
6931 // transformations before even evaluating whether vectorization is profitable.
6932 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6933 // the vectorization pipeline.
6934 if (!OrigLoop->isInnermost()) {
6935 // If the user doesn't provide a vectorization factor, determine a
6936 // reasonable one.
6937 if (UserVF.isZero()) {
6938 VF = ElementCount::getFixed(
6939 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6940 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6941
6942 // Make sure we have a VF > 1 for stress testing.
6943 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6944 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6945 << "overriding computed VF.\n");
6946 VF = ElementCount::getFixed(4);
6947 }
6948 }
6949 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6950 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6951 "VF needs to be a power of two");
6952 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6953 << "VF " << VF << " to build VPlans.\n");
6954 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue());
6955
6956 // For VPlan build stress testing, we bail out after VPlan construction.
6957 if (VPlanBuildStressTest)
6958 return VectorizationFactor::Disabled();
6959
6960 return {VF, 0 /*Cost*/};
6961 }
6962
6963 LLVM_DEBUG(
6964 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6965 "VPlan-native path.\n");
6966 return VectorizationFactor::Disabled();
6967}
6968
6969Optional<VectorizationFactor>
6970LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6971 assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
6972 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6973 Optional<unsigned> MaybeMaxVF =
6974 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC);
6975 if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6976 return None;
6977
6978 // Invalidate interleave groups if all blocks of the loop will be predicated.
6979 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6980 !useMaskedInterleavedAccesses(*TTI)) {
6981 LLVM_DEBUG(
6982 dbgs()
6983 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6984 "which requires masked-interleaved support.\n");
6985 if (CM.InterleaveInfo.invalidateGroups())
6986 // Invalidating interleave groups also requires invalidating all decisions
6987 // based on them, which includes widening decisions and uniform and scalar
6988 // values.
6989 CM.invalidateCostModelingDecisions();
6990 }
6991
6992 if (!UserVF.isZero()) {
6993 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6994 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6995 "VF needs to be a power of two");
6996 // Collect the instructions (and their associated costs) that will be more
6997 // profitable to scalarize.
6998 CM.selectUserVectorizationFactor(UserVF);
6999 CM.collectInLoopReductions();
7000 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
7001 UserVF.getKnownMinValue());
7002 LLVM_DEBUG(printPlans(dbgs()));
7003 return {{UserVF, 0}};
7004 }
7005
7006 unsigned MaxVF = MaybeMaxVF.getValue();
7007 assert(MaxVF != 0 && "MaxVF is zero.");
7008
7009 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
7010 // Collect Uniform and Scalar instructions after vectorization with VF.
7011 CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
7012
7013 // Collect the instructions (and their associated costs) that will be more
7014 // profitable to scalarize.
7015 if (VF > 1)
7016 CM.collectInstsToScalarize(ElementCount::getFixed(VF));
7017 }
7018
7019 CM.collectInLoopReductions();
7020
7021 buildVPlansWithVPRecipes(1, MaxVF);
7022 LLVM_DEBUG(printPlans(dbgs()));
7023 if (MaxVF == 1)
7024 return VectorizationFactor::Disabled();
7025
7026 // Select the optimal vectorization factor.
7027 return CM.selectVectorizationFactor(MaxVF);
7028}
7029
7030void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7031 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7032 << '\n');
7033 BestVF = VF;
7034 BestUF = UF;
7035
7036 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7037 return !Plan->hasVF(VF);
7038 });
7039 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7040}
7041
7042void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7043 DominatorTree *DT) {
7044 // Perform the actual loop transformation.
7045
7046 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7047 VPCallbackILV CallbackILV(ILV);
7048
7049 assert(BestVF.hasValue() && "Vectorization Factor is missing");
7050
7051 VPTransformState State{*BestVF, BestUF, LI,
7052 DT, ILV.Builder, ILV.VectorLoopValueMap,
7053 &ILV, CallbackILV};
7054 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7055 State.TripCount = ILV.getOrCreateTripCount(nullptr);
7056 State.CanonicalIV = ILV.Induction;
7057
7058 //===------------------------------------------------===//
7059 //
7060 // Notice: any optimization or new instruction that goes
7061 // into the code below should also be implemented in
7062 // the cost-model.
7063 //
7064 //===------------------------------------------------===//
7065
7066 // 2. Copy and widen instructions from the old loop into the new loop.
7067 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7068 VPlans.front()->execute(&State);
7069
7070 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7071 // predication, updating analyses.
7072 ILV.fixVectorizedLoop();
7073}
7074
7075void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7076 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7077 BasicBlock *Latch = OrigLoop->getLoopLatch();
7078
7079 // We create new control-flow for the vectorized loop, so the original
7080 // condition will be dead after vectorization if it's only used by the
7081 // branch.
7082 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7083 if (Cmp && Cmp->hasOneUse())
7084 DeadInstructions.insert(Cmp);
7085
7086 // We create new "steps" for induction variable updates to which the original
7087 // induction variables map. An original update instruction will be dead if
7088 // all its users except the induction variable are dead.
7089 for (auto &Induction : Legal->getInductionVars()) {
7090 PHINode *Ind = Induction.first;
7091 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7092 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7093 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7094 }))
7095 DeadInstructions.insert(IndUpdate);
7096
7097 // We record as "Dead" also the type-casting instructions we had identified
7098 // during induction analysis. We don't need any handling for them in the
7099 // vectorized loop because we have proven that, under a proper runtime
7100 // test guarding the vectorized loop, the value of the phi, and the casted
7101 // value of the phi, are the same. The last instruction in this casting chain
7102 // will get its scalar/vector/widened def from the scalar/vector/widened def
7103 // of the respective phi node. Any other casts in the induction def-use chain
7104 // have no other uses outside the phi update chain, and will be ignored.
7105 InductionDescriptor &IndDes = Induction.second;
7106 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7107 DeadInstructions.insert(Casts.begin(), Casts.end());
7108 }
7109}
7110
7111Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7112
7113Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7114
7115Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7116 Instruction::BinaryOps BinOp) {
7117 // When unrolling and the VF is 1, we only need to add a simple scalar.
7118 Type *Ty = Val->getType();
7119 assert(!Ty->isVectorTy() && "Val must be a scalar");
7120
7121 if (Ty->isFloatingPointTy()) {
7122 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7123
7124 // Floating point operations had to be 'fast' to enable the unrolling.
7125 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7126 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7127 }
7128 Constant *C = ConstantInt::get(Ty, StartIdx);
7129 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7130}
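// [Editor's note: a hedged example, not part of the original source.]
// For the unroller (VF = 1), getStepVector only has to offset the scalar
// value: with Val = %i, StartIdx = 2 and Step = %stride it emits
//   %induction = add %i, (mul 2, %stride)
// and for a floating-point IV it emits the requested BinOp (typically fadd)
// of %i and (fmul 2.0, %stride), relying on fast-math flags.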
7131
7132static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7133 SmallVector<Metadata *, 4> MDs;
7134 // Reserve first location for self reference to the LoopID metadata node.
7135 MDs.push_back(nullptr);
7136 bool IsUnrollMetadata = false;
7137 MDNode *LoopID = L->getLoopID();
7138 if (LoopID) {
7139 // First find existing loop unrolling disable metadata.
7140 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7141 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7142 if (MD) {
7143 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7144 IsUnrollMetadata =
7145 S && S->getString().startswith("llvm.loop.unroll.disable");
7146 }
7147 MDs.push_back(LoopID->getOperand(i));
7148 }
7149 }
7150
7151 if (!IsUnrollMetadata) {
7152 // Add runtime unroll disable metadata.
7153 LLVMContext &Context = L->getHeader()->getContext();
7154 SmallVector<Metadata *, 1> DisableOperands;
7155 DisableOperands.push_back(
7156 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7157 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7158 MDs.push_back(DisableNode);
7159 MDNode *NewLoopID = MDNode::get(Context, MDs);
7160 // Set operand 0 to refer to the loop id itself.
7161 NewLoopID->replaceOperandWith(0, NewLoopID);
7162 L->setLoopID(NewLoopID);
7163 }
7164}
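// [Editor's note: a hedged illustration, not part of the original source;
//  the metadata node numbers are arbitrary.]
// After AddRuntimeUnrollDisableMetaData runs on a loop with no existing
// unroll metadata, the latch roughly looks like:
//   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// (any operands the loop ID already carried are preserved ahead of !1).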
7165
7166bool LoopVectorizationPlanner::getDecisionAndClampRange(
7167 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7168 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7169 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
7170
7171 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7172 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7173 Range.End = TmpVF;
7174 break;
7175 }
7176
7177 return PredicateAtRangeStart;
7178}
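// [Editor's note: a hedged worked example, not part of the original source.]
// getDecisionAndClampRange: with Range = {Start = 1, End = 9} and a Predicate
// that holds for VF = 1 and VF = 2 but not for VF = 4, the loop above clamps
// Range.End to 4 and returns true (the decision taken at Range.Start), so the
// caller applies that decision uniformly to VFs 1 and 2 only.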
7179
7180/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7181/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7182/// of VF's starting at a given VF and extending it as much as possible. Each
7183/// vectorization decision can potentially shorten this sub-range during
7184/// buildVPlan().
7185void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7186 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7187 VFRange SubRange = {VF, MaxVF + 1};
7188 VPlans.push_back(buildVPlan(SubRange));
7189 VF = SubRange.End;
7190 }
7191}
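// [Editor's note: a hedged example, not part of the original source.]
// buildVPlans with MinVF = 1 and MaxVF = 8 first builds a plan for the
// sub-range {1, 9}; if some recipe decision changes at VF = 4, buildVPlan
// clamps that sub-range to {1, 4} and the loop continues with {4, 9}, so one
// VPlan covers VFs 1-2 and, if no further decision splits it, a second VPlan
// covers VFs 4-8.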
7192
7193VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7194 VPlanPtr &Plan) {
7195 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7196
7197 // Look for cached value.
7198 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7199 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7200 if (ECEntryIt != EdgeMaskCache.end())
7201 return ECEntryIt->second;
7202
7203 VPValue *SrcMask = createBlockInMask(Src, Plan);
7204
7205 // The terminator has to be a branch inst!
7206 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7207 assert(BI && "Unexpected terminator found");
7208
7209 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7210 return EdgeMaskCache[Edge] = SrcMask;
7211
7212 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7213 assert(EdgeMask && "No Edge Mask found for condition");
7214
7215 if (BI->getSuccessor(0) != Dst)
7216 EdgeMask = Builder.createNot(EdgeMask);
7217
7218 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7219 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7220
7221 return EdgeMaskCache[Edge] = EdgeMask;
7222}
7223
7224VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7225 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
2
Assuming the condition is true
3
'?' condition is true
7226
7227 // Look for cached value.
7228 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7229 if (BCEntryIt != BlockMaskCache.end())
4
Assuming the condition is false
5
Taking false branch
7230 return BCEntryIt->second;
7231
7232 // All-one mask is modelled as no-mask following the convention for masked
7233 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7234 VPValue *BlockMask = nullptr;
7235
7236 if (OrigLoop->getHeader() == BB) {
6
Assuming the condition is false
7
Taking false branch
7237 if (!CM.blockNeedsPredication(BB))
7238 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7239
7240 // Introduce the early-exit compare IV <= BTC to form header block mask.
7241 // This is used instead of IV < TC because TC may wrap, unlike BTC.
7242 // Start by constructing the desired canonical IV.
7243 VPValue *IV = nullptr;
7244 if (Legal->getPrimaryInduction())
7245 IV = Plan->getVPValue(Legal->getPrimaryInduction());
7246 else {
7247 auto IVRecipe = new VPWidenCanonicalIVRecipe();
7248 Builder.getInsertBlock()->appendRecipe(IVRecipe);
7249 IV = IVRecipe->getVPValue();
7250 }
7251 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7252 bool TailFolded = !CM.isScalarEpilogueAllowed();
7253
7254 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7255 // While ActiveLaneMask is a binary op that consumes the loop tripcount
7256 // as a second argument, we only pass the IV here and extract the
7257 // tripcount from the transform state where codegen of the VP instructions
7258 // happens.
7259 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7260 } else {
7261 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7262 }
7263 return BlockMaskCache[BB] = BlockMask;
7264 }
7265
7266 // This is the block mask. We OR all incoming edges.
7267 for (auto *Predecessor : predecessors(BB)) {
7268 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7269 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7.1
'EdgeMask' is non-null
10.1
'EdgeMask' is non-null
8
Taking false branch
11
Taking false branch
22
Assuming 'EdgeMask' is null
23
Taking true branch
7270 return BlockMaskCache[BB] = EdgeMask;
24
Potential leak of memory pointed to by 'BlockMask'
7271
7272 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8.1
'BlockMask' is null
11.1
'BlockMask' is non-null
9
Taking true branch
12
Taking false branch
7273 BlockMask = EdgeMask;
7274 continue;
10
Execution continues on line 7267
7275 }
7276
7277 BlockMask = Builder.createOr(BlockMask, EdgeMask);
13
Calling 'VPBuilder::createOr'
21
Returned allocated memory
7278 }
7279
7280 return BlockMaskCache[BB] = BlockMask;
7281}
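// [Editor's note: commentary on the analyzer trace above, not part of the
//  original source; the ownership claim is hedged, not verified here.]
// What the checker reports for createBlockInMask: in one iteration of the
// predecessor loop, Builder.createOr (steps 13/21) returns a pointer to a
// newly allocated VPInstruction that is held only in the local 'BlockMask'.
// In a later iteration createEdgeMask yields a null EdgeMask (step 22), so
// the function takes the early return at line 7270 and stores EdgeMask, not
// BlockMask, into BlockMaskCache; the local pointer then goes out of scope,
// which is why the checker flags a potential leak. Whether this is a real
// leak depends on whether VPBuilder::createOr already linked the new
// VPInstruction into the current VPBasicBlock (in which case the VPlan owns
// it and the report is a false positive); the trace alone only shows that
// the local pointer is dropped.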
7282
7283VPWidenMemoryInstructionRecipe *
7284VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7285 VPlanPtr &Plan) {
7286 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7287 "Must be called with either a load or store");
7288
7289 auto willWiden = [&](ElementCount VF) -> bool {
7290 assert(!VF.isScalable() && "unexpected scalable ElementCount");
7291 if (VF.isScalar())
7292 return false;
7293 LoopVectorizationCostModel::InstWidening Decision =
7294 CM.getWideningDecision(I, VF);
7295 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7296 "CM decision should be taken at this point.");
7297 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7298 return true;
7299 if (CM.isScalarAfterVectorization(I, VF) ||
7300 CM.isProfitableToScalarize(I, VF))
7301 return false;
7302 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7303 };
7304
7305 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7306 return nullptr;
7307
7308 VPValue *Mask = nullptr;
7309 if (Legal->isMaskRequired(I))
7310 Mask = createBlockInMask(I->getParent(), Plan);
7311
7312 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7313 if (LoadInst *Load = dyn_cast<LoadInst>(I))
7314 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7315
7316 StoreInst *Store = cast<StoreInst>(I);
7317 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7318 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7319}
7320
7321VPWidenIntOrFpInductionRecipe *
7322VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7323 // Check if this is an integer or fp induction. If so, build the recipe that
7324 // produces its scalar and vector values.
7325 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7326 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7327 II.getKind() == InductionDescriptor::IK_FpInduction)
7328 return new VPWidenIntOrFpInductionRecipe(Phi);
7329
7330 return nullptr;
7331}
7332
7333VPWidenIntOrFpInductionRecipe *
7334VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7335 VFRange &Range) const {
7336 // Optimize the special case where the source is a constant integer
7337 // induction variable. Notice that we can only optimize the 'trunc' case
7338 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7339 // (c) other casts depend on pointer size.
7340
7341 // Determine whether \p K is a truncation based on an induction variable that
7342 // can be optimized.
7343 auto isOptimizableIVTruncate =
7344 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7345 return [=](ElementCount VF) -> bool {
7346 return CM.isOptimizableIVTruncate(K, VF);
7347 };
7348 };
7349
7350 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7351 isOptimizableIVTruncate(I), Range))
7352 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7353 I);
7354 return nullptr;
7355}
7356
7357VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7358 // We know that all PHIs in non-header blocks are converted into selects, so
7359 // we don't have to worry about the insertion order and we can just use the
7360 // builder. At this point we generate the predication tree. There may be
7361 // duplications since this is a simple recursive scan, but future
7362 // optimizations will clean it up.
7363
7364 SmallVector<VPValue *, 2> Operands;
7365 unsigned NumIncoming = Phi->getNumIncomingValues();
7366 for (unsigned In = 0; In < NumIncoming; In++) {
7367 VPValue *EdgeMask =
7368 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7369 assert((EdgeMask || NumIncoming == 1) &&
7370 "Multiple predecessors with one having a full mask");
7371 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7372 if (EdgeMask)
7373 Operands.push_back(EdgeMask);
7374 }
7375 return new VPBlendRecipe(Phi, Operands);
7376}
7377
7378VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7379 VPlan &Plan) const {
7380
7381 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7382 [this, CI](ElementCount VF) {
7383 return CM.isScalarWithPredication(CI, VF);
7384 },
7385 Range);
7386
7387 if (IsPredicated)
7388 return nullptr;
7389
7390 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7391 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7392 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7393 return nullptr;
7394
7395 auto willWiden = [&](ElementCount VF) -> bool {
7396 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7397 // The following case may be scalarized depending on the VF.
7398 // The flag shows whether we use an intrinsic or a plain call for the
7399 // vectorized version of the instruction.
7400 // Is it beneficial to perform an intrinsic call compared to a lib call?
7401 bool NeedToScalarize = false;
7402 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7403 bool UseVectorIntrinsic =
7404 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7405 return UseVectorIntrinsic || !NeedToScalarize;
7406 };
7407
7408 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7409 return nullptr;
7410
7411 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7412}
7413
7414bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7415 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7416 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7417 // Instruction should be widened, unless it is scalar after vectorization,
7418 // scalarization is profitable or it is predicated.
7419 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7420 return CM.isScalarAfterVectorization(I, VF) ||
7421 CM.isProfitableToScalarize(I, VF) ||
7422 CM.isScalarWithPredication(I, VF);
7423 };
7424 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7425 Range);
7426}
7427
7428VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7429 auto IsVectorizableOpcode = [](unsigned Opcode) {
7430 switch (Opcode) {
7431 case Instruction::Add:
7432 case Instruction::And:
7433 case Instruction::AShr:
7434 case Instruction::BitCast:
7435 case Instruction::FAdd:
7436 case Instruction::FCmp:
7437 case Instruction::FDiv:
7438 case Instruction::FMul:
7439 case Instruction::FNeg:
7440 case Instruction::FPExt:
7441 case Instruction::FPToSI:
7442 case Instruction::FPToUI:
7443 case Instruction::FPTrunc:
7444 case Instruction::FRem:
7445 case Instruction::FSub:
7446 case Instruction::ICmp:
7447 case Instruction::IntToPtr:
7448 case Instruction::LShr:
7449 case Instruction::Mul:
7450 case Instruction::Or:
7451 case Instruction::PtrToInt:
7452 case Instruction::SDiv:
7453 case Instruction::Select:
7454 case Instruction::SExt:
7455 case Instruction::Shl:
7456 case Instruction::SIToFP:
7457 case Instruction::SRem:
7458 case Instruction::Sub:
7459 case Instruction::Trunc:
7460 case Instruction::UDiv:
7461 case Instruction::UIToFP:
7462 case Instruction::URem:
7463 case Instruction::Xor:
7464 case Instruction::ZExt:
7465 return true;
7466 }
7467 return false;
7468 };
7469
7470 if (!IsVectorizableOpcode(I->getOpcode()))
7471 return nullptr;
7472
7473 // Success: widen this instruction.
7474 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7475}
7476
7477VPBasicBlock *VPRecipeBuilder::handleReplication(
7478 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7479 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7480 VPlanPtr &Plan) {
7481 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7482 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7483 Range);
7484
7485 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7486 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7487 Range);
7488
7489 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7490 IsUniform, IsPredicated);
7491 setRecipe(I, Recipe);
7492
7493 // Find if I uses a predicated instruction. If so, it will use its scalar
7494 // value. Avoid hoisting the insert-element which packs the scalar value into
7495 // a vector value, as that happens iff all users use the vector value.
7496 for (auto &Op : I->operands())
7497 if (auto *PredInst = dyn_cast<Instruction>(Op))
7498 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7499 PredInst2Recipe[PredInst]->setAlsoPack(false);
7500
7501 // Finalize the recipe for Instr, first if it is not predicated.
7502 if (!IsPredicated) {
7503 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7504 VPBB->appendRecipe(Recipe);
7505 return VPBB;
7506 }
7507 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7508 assert(VPBB->getSuccessors().empty() &&
7509 "VPBB has successors when handling predicated replication.");
7510 // Record predicated instructions for above packing optimizations.
7511 PredInst2Recipe[I] = Recipe;
7512 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7513 VPBlockUtils::insertBlockAfter(Region, VPBB);
7514 auto *RegSucc = new VPBasicBlock();
7515 VPBlockUtils::insertBlockAfter(RegSucc, Region);
7516 return RegSucc;
7517}
7518
7519VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7520 VPRecipeBase *PredRecipe,
7521 VPlanPtr &Plan) {
7522 // Instructions marked for predication are replicated and placed under an
7523 // if-then construct to prevent side-effects.
7524
7525 // Generate recipes to compute the block mask for this region.
7526 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
1. Calling 'VPRecipeBuilder::createBlockInMask'
7527
7528 // Build the triangular if-then region.
7529 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7530 assert(Instr->getParent() && "Predicated instruction not in any basic block");
7531 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7532 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7533 auto *PHIRecipe =
7534 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7535 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7536 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7537 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7538
7539 // Note: first set Entry as region entry and then connect successors starting
7540 // from it in order, to propagate the "parent" of each VPBasicBlock.
7541 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7542 VPBlockUtils::connectBlocks(Pred, Exit);
7543
7544 return Region;
7545}
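For orientation, the region assembled above has this triangular shape (a sketch of the block names and their recipes only, not additional code):

//            pred.<opcode>.entry      (VPBranchOnMaskRecipe on BlockInMask)
//              /               \
//   pred.<opcode>.if            |     (the replicated, predicated PredRecipe)
//              \               /
//            pred.<opcode>.continue   (VPPredInstPHIRecipe, if Instr has a result)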
7546
7547VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7548 VFRange &Range,
7549 VPlanPtr &Plan) {
7550 // First, check for specific widening recipes that deal with calls, memory
7551 // operations, inductions and Phi nodes.
7552 if (auto *CI = dyn_cast<CallInst>(Instr))
7553 return tryToWidenCall(CI, Range, *Plan);
7554
7555 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7556 return tryToWidenMemory(Instr, Range, Plan);
7557
7558 VPRecipeBase *Recipe;
7559 if (auto Phi = dyn_cast<PHINode>(Instr)) {
7560 if (Phi->getParent() != OrigLoop->getHeader())
7561 return tryToBlend(Phi, Plan);
7562 if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7563 return Recipe;
7564 return new VPWidenPHIRecipe(Phi);
7565 }
7566
7567 if (isa<TruncInst>(Instr) &&
7568 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7569 return Recipe;
7570
7571 if (!shouldWiden(Instr, Range))
7572 return nullptr;
7573
7574 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7575 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7576 OrigLoop);
7577
7578 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7579 bool InvariantCond =
7580 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7581 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7582 InvariantCond);
7583 }
7584
7585 return tryToWiden(Instr, *Plan);
7586}
7587
7588void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7589 unsigned MaxVF) {
7590 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7591
7592 // Collect conditions feeding internal conditional branches; they need to be
7593 // represented in VPlan for it to model masking.
7594 SmallPtrSet<Value *, 1> NeedDef;
7595
7596 auto *Latch = OrigLoop->getLoopLatch();
7597 for (BasicBlock *BB : OrigLoop->blocks()) {
7598 if (BB == Latch)
7599 continue;
7600 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7601 if (Branch && Branch->isConditional())
7602 NeedDef.insert(Branch->getCondition());
7603 }
7604
7605 // If the tail is to be folded by masking, the primary induction variable, if
7606 // it exists, needs to be represented in VPlan for it to model early-exit masking.
7607 // Also, both the Phi and the live-out instruction of each reduction are
7608 // required in order to introduce a select between them in VPlan.
7609 if (CM.foldTailByMasking()) {
7610 if (Legal->getPrimaryInduction())
7611 NeedDef.insert(Legal->getPrimaryInduction());
7612 for (auto &Reduction : Legal->getReductionVars()) {
7613 NeedDef.insert(Reduction.first);
7614 NeedDef.insert(Reduction.second.getLoopExitInstr());
7615 }
7616 }
7617
7618 // Collect instructions from the original loop that will become trivially dead
7619 // in the vectorized loop. We don't need to vectorize these instructions. For
7620 // example, original induction update instructions can become dead because we
7621 // separately emit induction "steps" when generating code for the new loop.
7622 // Similarly, we create a new latch condition when setting up the structure
7623 // of the new loop, so the old one can become dead.
7624 SmallPtrSet<Instruction *, 4> DeadInstructions;
7625 collectTriviallyDeadInstructions(DeadInstructions);
7626
7627 // Add assume instructions we need to drop to DeadInstructions, to prevent
7628 // them from being added to the VPlan.
7629 // TODO: We only need to drop assumes in blocks that get flattened. If the
7630 // control flow is preserved, we should keep them.
7631 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7632 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7633
7634 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7635 // Dead instructions do not need sinking. Remove them from SinkAfter.
7636 for (Instruction *I : DeadInstructions)
7637 SinkAfter.erase(I);
7638
7639 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7640 VFRange SubRange = {VF, MaxVF + 1};
7641 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7642 DeadInstructions, SinkAfter));
7643 VF = SubRange.End;
7644 }
7645}
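A self-contained sketch of how the loop above carves [MinVF, MaxVF] into sub-ranges (illustrative only; VFRangeSketch, partitionVFs and sameDecision are hypothetical names):

#include <vector>

struct VFRangeSketch { unsigned Start; unsigned End; }; // VFs in [Start, End)

// Each resulting range is the maximal run of power-of-two VFs that share the
// same widening decisions, so a single VPlan can cover it; the next iteration
// resumes at the first VF the previous range did not cover.
std::vector<VFRangeSketch>
partitionVFs(unsigned MinVF, unsigned MaxVF,
             bool (*sameDecision)(unsigned RangeStart, unsigned Candidate)) {
  std::vector<VFRangeSketch> Ranges;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRangeSketch R{VF, MaxVF + 1};
    for (unsigned Cand = VF * 2; Cand < R.End; Cand *= 2)
      if (!sameDecision(R.Start, Cand)) {
        R.End = Cand; // Clamp, as the planner's range queries would.
        break;
      }
    Ranges.push_back(R);
    VF = R.End;
  }
  return Ranges;
}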
7646
7647VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7648 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7649 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7650 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7651
7652 // Hold a mapping from predicated instructions to their recipes, in order to
7653 // fix their AlsoPack behavior if a user is determined to replicate and use a
7654 // scalar instead of vector value.
7655 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7656
7657 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7658
7659 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7660
7661 // ---------------------------------------------------------------------------
7662 // Pre-construction: record ingredients whose recipes we'll need to further
7663 // process after constructing the initial VPlan.
7664 // ---------------------------------------------------------------------------
7665
7666 // Mark instructions we'll need to sink later and their targets as
7667 // ingredients whose recipe we'll need to record.
7668 for (auto &Entry : SinkAfter) {
7669 RecipeBuilder.recordRecipeOf(Entry.first);
7670 RecipeBuilder.recordRecipeOf(Entry.second);
7671 }
7672 for (auto &Reduction : CM.getInLoopReductionChains()) {
7673 PHINode *Phi = Reduction.first;
7674 RecurrenceDescriptor::RecurrenceKind Kind =
7675 Legal->getReductionVars()[Phi].getRecurrenceKind();
7676 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7677
7678 RecipeBuilder.recordRecipeOf(Phi);
7679 for (auto &R : ReductionOperations) {
7680 RecipeBuilder.recordRecipeOf(R);
7681 // For min/max reductions, where we have a pair of icmp/select, we also
7682 // need to record the ICmp recipe, so it can be removed later.
7683 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7684 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7685 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7686 }
7687 }
7688 }
7689
7690 // For each interleave group which is relevant for this (possibly trimmed)
7691 // Range, add it to the set of groups to be later applied to the VPlan and add
7692 // placeholders for its members' Recipes which we'll be replacing with a
7693 // single VPInterleaveRecipe.
7694 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7695 auto applyIG = [IG, this](ElementCount VF) -> bool {
7696 return (VF.isVector() && // Query is illegal for VF == 1
7697 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7698 LoopVectorizationCostModel::CM_Interleave);
7699 };
7700 if (!getDecisionAndClampRange(applyIG, Range))
7701 continue;
7702 InterleaveGroups.insert(IG);
7703 for (unsigned i = 0; i < IG->getFactor(); i++)
7704 if (Instruction *Member = IG->getMember(i))
7705 RecipeBuilder.recordRecipeOf(Member);
7706 };
7707
7708 // ---------------------------------------------------------------------------
7709 // Build initial VPlan: Scan the body of the loop in a topological order to
7710 // visit each basic block after having visited its predecessor basic blocks.
7711 // ---------------------------------------------------------------------------
7712
7713 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7714 auto Plan = std::make_unique<VPlan>();
7715 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7716 Plan->setEntry(VPBB);
7717
7718 // Represent values that will have defs inside VPlan.
7719 for (Value *V : NeedDef)
7720 Plan->addVPValue(V);
7721
7722 // Scan the body of the loop in a topological order to visit each basic block
7723 // after having visited its predecessor basic blocks.
7724 LoopBlocksDFS DFS(OrigLoop);
7725 DFS.perform(LI);
7726
7727 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7728 // Relevant instructions from basic block BB will be grouped into VPRecipe
7729 // ingredients and fill a new VPBasicBlock.
7730 unsigned VPBBsForBB = 0;
7731 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7732 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7733 VPBB = FirstVPBBForBB;
7734 Builder.setInsertPoint(VPBB);
7735
7736 // Introduce each ingredient into VPlan.
7737 // TODO: Model and preserve debug intrinsics in VPlan.
7738 for (Instruction &I : BB->instructionsWithoutDebug()) {
7739 Instruction *Instr = &I;
7740
7741 // First filter out irrelevant instructions, to ensure no recipes are
7742 // built for them.
7743 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7744 continue;
7745
7746 if (auto Recipe =
7747 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7748 RecipeBuilder.setRecipe(Instr, Recipe);
7749 VPBB->appendRecipe(Recipe);
7750 continue;
7751 }
7752
7753 // Otherwise, if all widening options failed, Instruction is to be
7754 // replicated. This may create a successor for VPBB.
7755 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7756 Instr, Range, VPBB, PredInst2Recipe, Plan);
7757 if (NextVPBB != VPBB) {
7758 VPBB = NextVPBB;
7759 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7760 : "");
7761 }
7762 }
7763 }
7764
7765 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7766 // may also be empty, such as the last one VPBB, reflecting original
7767 // basic-blocks with no recipes.
7768 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7769 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7770 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7771 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7772 delete PreEntry;
7773
7774 // ---------------------------------------------------------------------------
7775 // Transform initial VPlan: Apply previously taken decisions, in order, to
7776 // bring the VPlan to its final state.
7777 // ---------------------------------------------------------------------------
7778
7779 // Apply Sink-After legal constraints.
7780 for (auto &Entry : SinkAfter) {
7781 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7782 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7783 Sink->moveAfter(Target);
7784 }
7785
7786 // Interleave memory: for each Interleave Group we marked earlier as relevant
7787 // for this VPlan, replace the Recipes widening its memory instructions with a
7788 // single VPInterleaveRecipe at its insertion point.
7789 for (auto IG : InterleaveGroups) {
7790 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7791 RecipeBuilder.getRecipe(IG->getInsertPos()));
7792 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7793 ->insertBefore(Recipe);
7794
7795 for (unsigned i = 0; i < IG->getFactor(); ++i)
7796 if (Instruction *Member = IG->getMember(i)) {
7797 RecipeBuilder.getRecipe(Member)->eraseFromParent();
7798 }
7799 }
7800
7801 // Adjust the recipes for any inloop reductions.
7802 if (Range.Start > 1)
7803 adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7804
7805 // Finally, if tail is folded by masking, introduce selects between the phi
7806 // and the live-out instruction of each reduction, at the end of the latch.
7807 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7808 Builder.setInsertPoint(VPBB);
7809 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7810 for (auto &Reduction : Legal->getReductionVars()) {
7811 assert(!CM.isInLoopReduction(Reduction.first) &&
7812 "Didn't expect inloop tail folded reduction yet!");
7813 VPValue *Phi = Plan->getVPValue(Reduction.first);
7814 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7815 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7816 }
7817 }
7818
7819 std::string PlanName;
7820 raw_string_ostream RSO(PlanName);
7821 ElementCount VF = ElementCount::getFixed(Range.Start);
7822 Plan->addVF(VF);
7823 RSO << "Initial VPlan for VF={" << VF;
7824 for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7825 Plan->addVF(VF);
7826 RSO << "," << VF;
7827 }
7828 RSO << "},UF>=1";
7829 RSO.flush();
7830 Plan->setName(PlanName);
7831
7832 return Plan;
7833}
7834
7835VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7836 // Outer loop handling: They may require CFG and instruction level
7837 // transformations before even evaluating whether vectorization is profitable.
7838 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7839 // the vectorization pipeline.
7840 assert(!OrigLoop->isInnermost());
7841 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7842
7843 // Create new empty VPlan
7844 auto Plan = std::make_unique<VPlan>();
7845
7846 // Build hierarchical CFG
7847 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7848 HCFGBuilder.buildHierarchicalCFG();
7849
7850 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7851 Plan->addVF(ElementCount::getFixed(VF));
7852
7853 if (EnableVPlanPredication) {
7854 VPlanPredicator VPP(*Plan);
7855 VPP.predicate();
7856
7857 // Avoid running transformation to recipes until masked code generation in
7858 // VPlan-native path is in place.
7859 return Plan;
7860 }
7861
7862 SmallPtrSet<Instruction *, 1> DeadInstructions;
7863 VPlanTransforms::VPInstructionsToVPRecipes(
7864 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7865 return Plan;
7866}
7867
7868// Adjust the recipes for any inloop reductions. The chain of instructions
7869// leading from the loop exit instr to the phi need to be converted to
7870// reductions, with one operand being vector and the other being the scalar
7871// reduction chain.
7872void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7873 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7874 for (auto &Reduction : CM.getInLoopReductionChains()) {
7875 PHINode *Phi = Reduction.first;
7876 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7877 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7878
7879 // ReductionOperations are ordered top-down from the phi's use to the
7880 // LoopExitValue. We keep track of the previous item (the Chain) to tell
7881 // which of the two operands will remain scalar and which will be reduced.
7882 // For minmax the chain will be the select instructions.
7883 Instruction *Chain = Phi;
7884 for (Instruction *R : ReductionOperations) {
7885 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7886 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7887
7888 VPValue *ChainOp = Plan->getVPValue(Chain);
7889 unsigned FirstOpId;
7890 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7891 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7892 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7893 "Expected to replace a VPWidenSelectSC");
7894 FirstOpId = 1;
7895 } else {
7896 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7897 "Expected to replace a VPWidenSC");
7898 FirstOpId = 0;
7899 }
7900 unsigned VecOpId =
7901 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7902 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7903
7904 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7905 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7906 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7907 WidenRecipe->eraseFromParent();
7908
7909 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7910 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7911 VPRecipeBase *CompareRecipe =
7912 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7913 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7914 "Expected to replace a VPWidenSC");
7915 CompareRecipe->eraseFromParent();
7916 }
7917 Chain = R;
7918 }
7919 }
7920}
7921
7922Value* LoopVectorizationPlanner::VPCallbackILV::
7923getOrCreateVectorValues(Value *V, unsigned Part) {
7924 return ILV.getOrCreateVectorValue(V, Part);
7925}
7926
7927Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7928 Value *V, const VPIteration &Instance) {
7929 return ILV.getOrCreateScalarValue(V, Instance);
7930}
7931
7932void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7933 VPSlotTracker &SlotTracker) const {
7934 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7935 IG->getInsertPos()->printAsOperand(O, false);
7936 O << ", ";
7937 getAddr()->printAsOperand(O, SlotTracker);
7938 VPValue *Mask = getMask();
7939 if (Mask) {
7940 O << ", ";
7941 Mask->printAsOperand(O, SlotTracker);
7942 }
7943 for (unsigned i = 0; i < IG->getFactor(); ++i)
7944 if (Instruction *I = IG->getMember(i))
7945 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
7946}
7947
7948void VPWidenCallRecipe::execute(VPTransformState &State) {
7949 State.ILV->widenCallInstruction(Ingredient, User, State);
7950}
7951
7952void VPWidenSelectRecipe::execute(VPTransformState &State) {
7953 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7954}
7955
7956void VPWidenRecipe::execute(VPTransformState &State) {
7957 State.ILV->widenInstruction(Ingredient, User, State);
7958}
7959
7960void VPWidenGEPRecipe::execute(VPTransformState &State) {
7961 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7962 IsIndexLoopInvariant, State);
7963}
7964
7965void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7966 assert(!State.Instance && "Int or FP induction being replicated.");
7967 State.ILV->widenIntOrFpInduction(IV, Trunc);
7968}
7969
7970void VPWidenPHIRecipe::execute(VPTransformState &State) {
7971 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7972}
7973
7974void VPBlendRecipe::execute(VPTransformState &State) {
7975 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7976 // We know that all PHIs in non-header blocks are converted into
7977 // selects, so we don't have to worry about the insertion order and we
7978 // can just use the builder.
7979 // At this point we generate the predication tree. There may be
7980 // duplications since this is a simple recursive scan, but future
7981 // optimizations will clean it up.
7982
7983 unsigned NumIncoming = getNumIncomingValues();
7984
7985 // Generate a sequence of selects of the form:
7986 // SELECT(Mask3, In3,
7987 // SELECT(Mask2, In2,
7988 // SELECT(Mask1, In1,
7989 // In0)))
7990 // Note that Mask0 is never used: lanes for which no path reaches this phi and
7991 // are essentially undef are taken from In0.
7992 InnerLoopVectorizer::VectorParts Entry(State.UF);
7993 for (unsigned In = 0; In < NumIncoming; ++In) {
7994 for (unsigned Part = 0; Part < State.UF; ++Part) {
7995 // We might have single edge PHIs (blocks) - use an identity
7996 // 'select' for the first PHI operand.
7997 Value *In0 = State.get(getIncomingValue(In), Part);
7998 if (In == 0)
7999 Entry[Part] = In0; // Initialize with the first incoming value.
8000 else {
8001 // Select between the current value and the previous incoming edge
8002 // based on the incoming mask.
8003 Value *Cond = State.get(getMask(In), Part);
8004 Entry[Part] =
8005 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8006 }
8007 }
8008 }
8009 for (unsigned Part = 0; Part < State.UF; ++Part)
8010 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8011}
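A standalone sketch of the lane-wise effect of the select chain generated above (illustrative only; plain vectors stand in for the IR values, and blendLanes is a hypothetical name):

#include <vector>

// Later incoming values override earlier ones wherever their mask bit is set;
// Mask0 is never consulted, so lanes that no mask selects keep the value of In0.
std::vector<int> blendLanes(const std::vector<std::vector<int>> &In,
                            const std::vector<std::vector<bool>> &Mask) {
  std::vector<int> Entry = In[0];              // Initialize with In0.
  for (size_t I = 1; I < In.size(); ++I)       // Entry = SELECT(MaskI, InI, Entry)
    for (size_t Lane = 0; Lane < Entry.size(); ++Lane)
      if (Mask[I][Lane])
        Entry[Lane] = In[I][Lane];
  return Entry;
}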
8012
8013void VPInterleaveRecipe::execute(VPTransformState &State) {
8014 assert(!State.Instance && "Interleave group being replicated.");
8015 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
8016}
8017
8018void VPReductionRecipe::execute(VPTransformState &State) {
8019 assert(!State.Instance && "Reduction being replicated.");
8020 for (unsigned Part = 0; Part < State.UF; ++Part) {
8021 unsigned Kind = RdxDesc->getRecurrenceKind();
8022 Value *NewVecOp = State.get(VecOp, Part);
8023 Value *NewRed =
8024 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8025 Value *PrevInChain = State.get(ChainOp, Part);
8026 Value *NextInChain;
8027 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8028 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8029 NextInChain =
8030 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8031 NewRed, PrevInChain);
8032 } else {
8033 NextInChain = State.Builder.CreateBinOp(
8034 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
8035 }
8036 State.ValueMap.setVectorValue(I, Part, NextInChain);
8037 }
8038}
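A small numeric sketch of the per-part chaining performed above, using addition as the reduction (illustrative only; reduceParts is a hypothetical name and std::accumulate stands in for the target reduction):

#include <numeric>
#include <vector>

int reduceParts(const std::vector<std::vector<int>> &VecOpPerPart, int Start) {
  int Chain = Start;                      // The phi / incoming chain value.
  for (const auto &Part : VecOpPerPart) { // One iteration per unroll part.
    int NewRed = std::accumulate(Part.begin(), Part.end(), 0); // reduce the vector operand
    Chain = Chain + NewRed;               // NextInChain = binop(NewRed, PrevInChain)
  }
  return Chain;
}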
8039
8040void VPReplicateRecipe::execute(VPTransformState &State) {
8041 if (State.Instance) { // Generate a single instance.
8042 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
8043 IsPredicated, State);
8044 // Insert scalar instance packing it into a vector.
8045 if (AlsoPack && State.VF.isVector()) {
8046 // If we're constructing lane 0, initialize to start from undef.
8047 if (State.Instance->Lane == 0) {
8048 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8049 Value *Undef =
8050 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8051 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8052 }
8053 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8054 }
8055 return;
8056 }
8057
8058 // Generate scalar instances for all VF lanes of all UF parts, unless the
8059 // instruction is uniform, in which case generate only the first lane for each
8060 // of the UF parts.
8061 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8062 for (unsigned Part = 0; Part < State.UF; ++Part)
8063 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8064 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
8065 IsPredicated, State);
8066}
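A sketch of the lane loop above (illustrative only; the Emit callback is a hypothetical stand-in for scalarizeInstruction):

#include <functional>

// A uniform instruction only needs lane 0 of each unroll part; otherwise all
// VF lanes of all UF parts are scalarized.
void replicateLanes(unsigned UF, unsigned VF, bool IsUniform,
                    const std::function<void(unsigned Part, unsigned Lane)> &Emit) {
  unsigned EndLane = IsUniform ? 1 : VF;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      Emit(Part, Lane);
}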
8067
8068void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8069 assert(State.Instance && "Branch on Mask works only on single instance.");
8070
8071 unsigned Part = State.Instance->Part;
8072 unsigned Lane = State.Instance->Lane;
8073
8074 Value *ConditionBit = nullptr;
8075 VPValue *BlockInMask = getMask();
8076 if (BlockInMask) {
8077 ConditionBit = State.get(BlockInMask, Part);
8078 if (ConditionBit->getType()->isVectorTy())
8079 ConditionBit = State.Builder.CreateExtractElement(
8080 ConditionBit, State.Builder.getInt32(Lane));
8081 } else // Block in mask is all-one.
8082 ConditionBit = State.Builder.getTrue();
8083
8084 // Replace the temporary unreachable terminator with a new conditional branch,
8085 // whose two destinations will be set later when they are created.
8086 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8087 assert(isa<UnreachableInst>(CurrentTerminator) &&
8088 "Expected to replace unreachable terminator with conditional branch.");
8089 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8090 CondBr->setSuccessor(0, nullptr);
8091 ReplaceInstWithInst(CurrentTerminator, CondBr);
8092}
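A minimal sketch of the per-lane condition computed above (illustrative only; laneCondition is a hypothetical name and a std::vector<bool> stands in for the mask value of the current part):

#include <vector>

bool laneCondition(const std::vector<bool> *BlockInMask, unsigned Lane) {
  if (!BlockInMask)
    return true;                // No mask: the block-in mask is all-one.
  return (*BlockInMask)[Lane];  // Vector mask: extract this lane's bit.
}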
8093
8094void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8095 assert(State.Instance && "Predicated instruction PHI works per instance.");
8096 Instruction *ScalarPredInst = cast<Instruction>(
8097 State.ValueMap.getScalarValue(PredInst, *State.Instance));
8098 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8099 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8100 assert(PredicatingBB && "Predicated block has no single predecessor.");
8101
8102 // By current pack/unpack logic we need to generate only a single phi node: if
8103 // a vector value for the predicated instruction exists at this point it means
8104 // the instruction has vector users only, and a phi for the vector value is
8105 // needed. In this case the recipe of the predicated instruction is marked to
8106 // also do that packing, thereby "hoisting" the insert-element sequence.
8107 // Otherwise, a phi node for the scalar value is needed.
8108 unsigned Part = State.Instance->Part;
8109 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8110 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8111 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8112 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8113 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8114 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8115 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8116 } else {
8117 Type *PredInstType = PredInst->getType();
8118 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8119 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8120 Phi->addIncoming(ScalarPredInst, PredicatedBB);
8121 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8122 }
8123}
8124
8125void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8126 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8127 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8128 getMask());
8129}
8130
8131// Determine how to lower the scalar epilogue, which depends on 1) optimising
8132// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8133// predication, and 4) a TTI hook that analyses whether the loop is suitable
8134// for predication.
8135static ScalarEpilogueLowering getScalarEpilogueLowering(
8136 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8137 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8138 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8139 LoopVectorizationLegality &LVL) {
8140 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8141 // don't look at hints or options, and don't request a scalar epilogue.
8142 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8143 // LoopAccessInfo (due to code dependency and not being able to reliably get
8144 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8145 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8146 // versioning when the vectorization is forced, unlike hasOptSize. So revert
8147 // back to the old way and vectorize with versioning when forced. See D81345.)
8148 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8149 PGSOQueryType::IRPass) &&
8150 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8151 return CM_ScalarEpilogueNotAllowedOptSize;
8152
8153 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8154 !PreferPredicateOverEpilogue;
8155
8156 // 2) Next, if disabling predication is requested on the command line, honour
8157 // this and request a scalar epilogue.
8158 if (PredicateOptDisabled)
8159 return CM_ScalarEpilogueAllowed;
8160
8161 // 3) and 4) look if enabling predication is requested on the command line,
8162 // with a loop hint, or if the TTI hook indicates this is profitable, request
8163 // predication.
8164 if (PreferPredicateOverEpilogue ||
8165 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8166 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8167 LVL.getLAI()) &&
8168 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8169 return CM_ScalarEpilogueNotNeededUsePredicate;
8170
8171 return CM_ScalarEpilogueAllowed;
8172}
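A standalone restatement of the decision order above (illustrative only; the enum and flag names are hypothetical, condensing the hints, command-line options and the TTI query into booleans):

enum class EpilogueChoice { NotAllowedOptSize, Allowed, UsePredicate };

EpilogueChoice chooseEpilogue(bool OptForSize, bool PredicationDisabled,
                              bool PredicationRequested, bool TTIPrefersPredication) {
  if (OptForSize)
    return EpilogueChoice::NotAllowedOptSize; // 1) size wins over everything
  if (PredicationDisabled)
    return EpilogueChoice::Allowed;           // 2) keep a scalar epilogue
  if (PredicationRequested || TTIPrefersPredication)
    return EpilogueChoice::UsePredicate;      // 3) + 4) fold the tail by masking
  return EpilogueChoice::Allowed;
}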
8173
8174// Process the loop in the VPlan-native vectorization path. This path builds
8175// VPlan upfront in the vectorization pipeline, which allows to apply
8176// VPlan-to-VPlan transformations from the very beginning without modifying the
8177// input LLVM IR.
8178static bool processLoopInVPlanNativePath(
8179 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8180 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8181 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8182 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8183 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8184
8185 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8186 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8187 return false;
8188 }
8189 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8190 Function *F = L->getHeader()->getParent();
8191 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8192
8193 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8194 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8195
8196 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8197 &Hints, IAI);
8198 // Use the planner for outer loop vectorization.
8199 // TODO: CM is not used at this point inside the planner. Turn CM into an
8200 // optional argument if we don't need it in the future.
8201 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8202
8203 // Get user vectorization factor.
8204 const unsigned UserVF = Hints.getWidth();
8205
8206 // Plan how to best vectorize, return the best VF and its cost.
8207 const VectorizationFactor VF =
8208 LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8209
8210 // If we are stress testing VPlan builds, do not attempt to generate vector
8211 // code. Masked vector code generation support will follow soon.
8212 // Also, do not attempt to vectorize if no vector code will be produced.
8213 if (VPlanBuildStressTest || EnableVPlanPredication ||
8214 VectorizationFactor::Disabled() == VF)
8215 return false;
8216
8217 LVP.setBestPlan(VF.Width, 1);
8218
8219 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8220 &CM, BFI, PSI);
8221 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8222 << L->getHeader()->getParent()->getName() << "\"\n");
8223 LVP.executePlan(LB, DT);
8224
8225 // Mark the loop as already vectorized to avoid vectorizing again.
8226 Hints.setAlreadyVectorized();
8227
8228 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8229 return true;
8230}
8231
8232LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8233 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8234 !EnableLoopInterleaving),
8235 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8236 !EnableLoopVectorization) {}
8237
8238bool LoopVectorizePass::processLoop(Loop *L) {
8239 assert((EnableVPlanNativePath || L->isInnermost()) &&
8240 "VPlan-native path is not enabled. Only process inner loops.");
8241
8242#ifndef NDEBUG
8243 const std::string DebugLocStr = getDebugLocString(L);
8244#endif /* NDEBUG */
8245
8246 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8247 << L->getHeader()->getParent()->getName() << "\" from "
8248 << DebugLocStr << "\n");
8249
8250 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8251
8252 LLVM_DEBUG(
8253 dbgs() << "LV: Loop hints:"
8254 << " force="
8255 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8256 ? "disabled"
8257 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8258 ? "enabled"
8259 : "?"))
8260 << " width=" << Hints.getWidth()
8261 << " unroll=" << Hints.getInterleave() << "\n");
8262
8263 // Function containing loop
8264 Function *F = L->getHeader()->getParent();
8265
8266 // Looking at the diagnostic output is the only way to determine if a loop
8267 // was vectorized (other than looking at the IR or machine code), so it
8268 // is important to generate an optimization remark for each loop. Most of
8269 // these messages are generated as OptimizationRemarkAnalysis. Remarks
8270 // generated as OptimizationRemark and OptimizationRemarkMissed are
8271 // less verbose reporting vectorized loops and unvectorized loops that may
8272 // benefit from vectorization, respectively.
8273
8274 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8275 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8276 return false;
8277 }
8278
8279 PredicatedScalarEvolution PSE(*SE, *L);
8280
8281 // Check if it is legal to vectorize the loop.
8282 LoopVectorizationRequirements Requirements(*ORE);
8283 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8284 &Requirements, &Hints, DB, AC, BFI, PSI);
8285 if (!LVL.canVectorize(EnableVPlanNativePath)) {
8286 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8287 Hints.emitRemarkWithHints();
8288 return false;
8289 }
8290
8291 // Check the function attributes and profiles to find out if this function
8292 // should be optimized for size.
8293 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8294 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8295
8296 // Entrance to the VPlan-native vectorization path. Outer loops are processed
8297 // here. They may require CFG and instruction level transformations before
8298 // even evaluating whether vectorization is profitable. Since we cannot modify
8299 // the incoming IR, we need to build VPlan upfront in the vectorization
8300 // pipeline.
8301 if (!L->isInnermost())
8302 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8303 ORE, BFI, PSI, Hints);
8304
8305 assert(L->isInnermost() && "Inner loop expected.");
8306
8307 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8308 // count by optimizing for size, to minimize overheads.
8309 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8310 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8311 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8312 << "This loop is worth vectorizing only if no scalar "
8313 << "iteration overheads are incurred.");
8314 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8315 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8316 else {
8317 LLVM_DEBUG(dbgs() << "\n");
8318 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8319 }
8320 }
8321
8322 // Check the function attributes to see if implicit floats are allowed.
8323 // FIXME: This check doesn't seem possibly correct -- what if the loop is
8324 // an integer loop and the vector instructions selected are purely integer
8325 // vector instructions?
8326 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8327 reportVectorizationFailure(
8328 "Can't vectorize when the NoImplicitFloat attribute is used",
8329 "loop not vectorized due to NoImplicitFloat attribute",
8330 "NoImplicitFloat", ORE, L);
8331 Hints.emitRemarkWithHints();
8332 return false;
8333 }
8334
8335 // Check if the target supports potentially unsafe FP vectorization.
8336 // FIXME: Add a check for the type of safety issue (denormal, signaling)
8337 // for the target we're vectorizing for, to make sure none of the
8338 // additional fp-math flags can help.
8339 if (Hints.isPotentiallyUnsafe() &&
8340 TTI->isFPVectorizationPotentiallyUnsafe()) {
8341 reportVectorizationFailure(
8342 "Potentially unsafe FP op prevents vectorization",
8343 "loop not vectorized due to unsafe FP support.",
8344 "UnsafeFP", ORE, L);
8345 Hints.emitRemarkWithHints();
8346 return false;
8347 }
8348
8349 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8350 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8351
8352 // If an override option has been passed in for interleaved accesses, use it.
8353 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8354 UseInterleaved = EnableInterleavedMemAccesses;
8355
8356 // Analyze interleaved memory accesses.
8357 if (UseInterleaved) {
8358 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8359 }
8360
8361 // Use the cost model.
8362 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8363 F, &Hints, IAI);
8364 CM.collectValuesToIgnore();
8365
8366 // Use the planner for vectorization.
8367 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8368
8369 // Get user vectorization factor and interleave count.
8370 unsigned UserVF = Hints.getWidth();
8371 unsigned UserIC = Hints.getInterleave();
8372
8373 // Plan how to best vectorize, return the best VF and its cost.
8374 Optional<VectorizationFactor> MaybeVF =
8375 LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8376
8377 VectorizationFactor VF = VectorizationFactor::Disabled();
8378 unsigned IC = 1;
8379
8380 if (MaybeVF) {
8381 VF = *MaybeVF;
8382 // Select the interleave count.
8383 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8384 }
8385
8386 // Identify the diagnostic messages that should be produced.
8387 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8388 bool VectorizeLoop = true, InterleaveLoop = true;
8389 if (Requirements.doesNotMeet(F, L, Hints)) {
8390 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8391 "requirements.\n");
8392 Hints.emitRemarkWithHints();
8393 return false;
8394 }
8395
8396 if (VF.Width == 1) {
8397 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8398 VecDiagMsg = std::make_pair(
8399 "VectorizationNotBeneficial",
8400 "the cost-model indicates that vectorization is not beneficial");
8401 VectorizeLoop = false;
8402 }
8403
8404 if (!MaybeVF && UserIC > 1) {
8405 // Tell the user interleaving was avoided up-front, despite being explicitly
8406 // requested.
8407 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8408 "interleaving should be avoided up front\n");
8409 IntDiagMsg = std::make_pair(
8410 "InterleavingAvoided",
8411 "Ignoring UserIC, because interleaving was avoided up front");
8412 InterleaveLoop = false;
8413 } else if (IC == 1 && UserIC <= 1) {
8414 // Tell the user interleaving is not beneficial.
8415 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8416 IntDiagMsg = std::make_pair(
8417 "InterleavingNotBeneficial",
8418 "the cost-model indicates that interleaving is not beneficial");
8419 InterleaveLoop = false;
8420 if (UserIC == 1) {
8421 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8422 IntDiagMsg.second +=
8423 " and is explicitly disabled or interleave count is set to 1";
8424 }
8425 } else if (IC > 1 && UserIC == 1) {
8426 // Tell the user interleaving is beneficial, but it is explicitly disabled.
8427 LLVM_DEBUG(
8428 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8429 IntDiagMsg = std::make_pair(
8430 "InterleavingBeneficialButDisabled",
8431 "the cost-model indicates that interleaving is beneficial "
8432 "but is explicitly disabled or interleave count is set to 1");
8433 InterleaveLoop = false;
8434 }
8435
8436 // Override IC if user provided an interleave count.
8437 IC = UserIC > 0 ? UserIC : IC;
8438
8439 // Emit diagnostic messages, if any.
8440 const char *VAPassName = Hints.vectorizeAnalysisPassName();
8441 if (!VectorizeLoop && !InterleaveLoop) {
8442 // Do not vectorize or interleave the loop.
8443 ORE->emit([&]() {
8444 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8445 L->getStartLoc(), L->getHeader())
8446 << VecDiagMsg.second;
8447 });
8448 ORE->emit([&]() {
8449 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8450 L->getStartLoc(), L->getHeader())
8451 << IntDiagMsg.second;
8452 });
8453 return false;
8454 } else if (!VectorizeLoop && InterleaveLoop) {
8455 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8456 ORE->emit([&]() {
8457 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8458 L->getStartLoc(), L->getHeader())
8459 << VecDiagMsg.second;
8460 });
8461 } else if (VectorizeLoop && !InterleaveLoop) {
8462 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8463 << ") in " << DebugLocStr << '\n');
8464 ORE->emit([&]() {
8465 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8466 L->getStartLoc(), L->getHeader())
8467 << IntDiagMsg.second;
8468 });
8469 } else if (VectorizeLoop && InterleaveLoop) {
8470 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8471 << ") in " << DebugLocStr << '\n');
8472 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8473 }
8474
8475 LVP.setBestPlan(VF.Width, IC);
8476
8477 using namespace ore;
8478 bool DisableRuntimeUnroll = false;
8479 MDNode *OrigLoopID = L->getLoopID();
8480
8481 if (!VectorizeLoop) {
8482 assert(IC > 1 && "interleave count should not be 1 or 0");
8483 // If we decided that it is not legal to vectorize the loop, then
8484 // interleave it.
8485 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8486 BFI, PSI);
8487 LVP.executePlan(Unroller, DT);
8488
8489 ORE->emit([&]() {
8490 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8491 L->getHeader())
8492 << "interleaved loop (interleaved count: "
8493 << NV("InterleaveCount", IC) << ")";
8494 });
8495 } else {
8496 // If we decided that it is *legal* to vectorize the loop, then do it.
8497 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8498 &LVL, &CM, BFI, PSI);
8499 LVP.executePlan(LB, DT);
8500 ++LoopsVectorized;
8501
8502 // Add metadata to disable runtime unrolling of the scalar loop when there are
8503 // no runtime checks about strides and memory. A scalar loop that is
8504 // rarely used is not worth unrolling.
8505 if (!LB.areSafetyChecksAdded())
8506 DisableRuntimeUnroll = true;
8507
8508 // Report the vectorization decision.
8509 ORE->emit([&]() {
8510 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8511 L->getHeader())
8512 << "vectorized loop (vectorization width: "
8513 << NV("VectorizationFactor", VF.Width)
8514 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8515 });
8516 }
8517
8518 Optional<MDNode *> RemainderLoopID =
8519 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8520 LLVMLoopVectorizeFollowupEpilogue});
8521 if (RemainderLoopID.hasValue()) {
8522 L->setLoopID(RemainderLoopID.getValue());
8523 } else {
8524 if (DisableRuntimeUnroll)
8525 AddRuntimeUnrollDisableMetaData(L);
8526
8527 // Mark the loop as already vectorized to avoid vectorizing again.
8528 Hints.setAlreadyVectorized();
8529 }
8530
8531 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8532 return true;
8533}
8534
8535LoopVectorizeResult LoopVectorizePass::runImpl(
8536 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8537 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8538 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8539 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8540 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8541 SE = &SE_;
8542 LI = &LI_;
8543 TTI = &TTI_;
8544 DT = &DT_;
8545 BFI = &BFI_;
8546 TLI = TLI_;
8547 AA = &AA_;
8548 AC = &AC_;
8549 GetLAA = &GetLAA_;
8550 DB = &DB_;
8551 ORE = &ORE_;
8552 PSI = PSI_;
8553
8554 // Don't attempt if
8555 // 1. the target claims to have no vector registers, and
8556 // 2. interleaving won't help ILP.
8557 //
8558 // The second condition is necessary because, even if the target has no
8559 // vector registers, loop vectorization may still enable scalar
8560 // interleaving.
8561 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8562 TTI->getMaxInterleaveFactor(1) < 2)
8563 return LoopVectorizeResult(false, false);
8564
8565 bool Changed = false, CFGChanged = false;
8566
8567 // The vectorizer requires loops to be in simplified form.
8568 // Since simplification may add new inner loops, it has to run before the
8569 // legality and profitability checks. This means running the loop vectorizer
8570 // will simplify all loops, regardless of whether anything ends up being
8571 // vectorized.
8572 for (auto &L : *LI)
8573 Changed |= CFGChanged |=
8574 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8575
8576 // Build up a worklist of inner-loops to vectorize. This is necessary as
8577 // the act of vectorizing or partially unrolling a loop creates new loops
8578 // and can invalidate iterators across the loops.
8579 SmallVector<Loop *, 8> Worklist;
8580
8581 for (Loop *L : *LI)
8582 collectSupportedLoops(*L, LI, ORE, Worklist);
8583
8584 LoopsAnalyzed += Worklist.size();
8585
8586 // Now walk the identified inner loops.
8587 while (!Worklist.empty()) {
8588 Loop *L = Worklist.pop_back_val();
8589
8590 // For the inner loops we actually process, form LCSSA to simplify the
8591 // transform.
8592 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8593
8594 Changed |= CFGChanged |= processLoop(L);
8595 }
8596
8597 // Process each loop nest in the function.
8598 return LoopVectorizeResult(Changed, CFGChanged);
8599}
8600
8601PreservedAnalyses LoopVectorizePass::run(Function &F,
8602 FunctionAnalysisManager &AM) {
8603 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8604 auto &LI = AM.getResult<LoopAnalysis>(F);
8605 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8606 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8607 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8608 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8609 auto &AA = AM.getResult<AAManager>(F);
8610 auto &AC = AM.getResult<AssumptionAnalysis>(F);
8611 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8612 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8613 MemorySSA *MSSA = EnableMSSALoopDependency
8614 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8615 : nullptr;
8616
8617 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8618 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8619 [&](Loop &L) -> const LoopAccessInfo & {
8620 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
8621 TLI, TTI, nullptr, MSSA};
8622 return LAM.getResult<LoopAccessAnalysis>(L, AR);
8623 };
8624 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
8625 ProfileSummaryInfo *PSI =
8626 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8627 LoopVectorizeResult Result =
8628 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8629 if (!Result.MadeAnyChange)
8630 return PreservedAnalyses::all();
8631 PreservedAnalyses PA;
8632
8633 // We currently do not preserve loopinfo/dominator analyses with outer loop
8634 // vectorization. Until this is addressed, mark these analyses as preserved
8635 // only for non-VPlan-native path.
8636 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8637 if (!EnableVPlanNativePath) {
8638 PA.preserve<LoopAnalysis>();
8639 PA.preserve<DominatorTreeAnalysis>();
8640 }
8641 PA.preserve<BasicAA>();
8642 PA.preserve<GlobalsAA>();
8643 if (!Result.MadeCFGChange)
8644 PA.preserveSet<CFGAnalyses>();
8645 return PA;
8646}

/build/llvm-toolchain-snapshot-12~++20200927111121+5811d723998/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides a LoopVectorizationPlanner class.
11/// LoopVectorizationPlanner - drives the vectorization process after having
12/// passed Legality checks.
13///
14/// The planner builds and optimizes the Vectorization Plans which record the
15/// decisions on how to vectorize the given loop. In particular, the plans
16/// represent the control-flow of the vectorized version, the replication of
17/// instructions that are to be scalarized, and the interleaved access groups.
18///
19/// Also provides a VPlan-based builder utility analogous to IRBuilder.
20/// It provides an instruction-level API for generating VPInstructions while
21/// abstracting away the Recipe manipulation details.
22//===----------------------------------------------------------------------===//
23
24#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
25#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
26
27#include "VPlan.h"
28#include "llvm/Analysis/LoopInfo.h"
29#include "llvm/Analysis/TargetLibraryInfo.h"
30#include "llvm/Analysis/TargetTransformInfo.h"
31
32namespace llvm {
33
34class LoopVectorizationLegality;
35class LoopVectorizationCostModel;
36class PredicatedScalarEvolution;
37class VPRecipeBuilder;
38
39/// VPlan-based builder utility analogous to IRBuilder.
40class VPBuilder {
41 VPBasicBlock *BB = nullptr;
42 VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
43
44 VPInstruction *createInstruction(unsigned Opcode,
45 ArrayRef<VPValue *> Operands) {
46 VPInstruction *Instr = new VPInstruction(Opcode, Operands);
16. Memory is allocated
47 if (BB)
17. Assuming field 'BB' is null
18. Taking false branch
48 BB->insert(Instr, InsertPt);
49 return Instr;
50 }
51
52 VPInstruction *createInstruction(unsigned Opcode,
53 std::initializer_list<VPValue *> Operands) {
54 return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
15. Calling 'VPBuilder::createInstruction'
19. Returned allocated memory
55 }
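As the analyzer steps above indicate, createInstruction allocates a VPInstruction with new and, when no insertion block is set, returns the pointer without inserting it anywhere. A minimal sketch of how such a call can leave the allocation unowned (hypothetical calling code, not taken from this header; 'Op' stands in for some operand):

    VPBuilder Builder;                        // no insert point set, so BB stays null
    VPValue *NotOp = Builder.createNot(Op);   // allocates a VPInstruction but does not insert it
    // If every path that reaches this point stops using NotOp without handing the
    // pointer to an owner (a VPBasicBlock, a VPlan, or an explicit delete), the
    // allocation becomes unreachable and the analyzer flags it as a potential leak.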
56
57public:
58 VPBuilder() {}
59
60 /// Clear the insertion point: created instructions will not be inserted into
61 /// a block.
62 void clearInsertionPoint() {
63 BB = nullptr;
64 InsertPt = VPBasicBlock::iterator();
65 }
66
67 VPBasicBlock *getInsertBlock() const { return BB; }
68 VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
69
70 /// InsertPoint - A saved insertion point.
71 class VPInsertPoint {
72 VPBasicBlock *Block = nullptr;
73 VPBasicBlock::iterator Point;
74
75 public:
76 /// Creates a new insertion point which doesn't point to anything.
77 VPInsertPoint() = default;
78
79 /// Creates a new insertion point at the given location.
80 VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
81 : Block(InsertBlock), Point(InsertPoint) {}
82
83 /// Returns true if this insert point is set.
84 bool isSet() const { return Block != nullptr; }
85
86 VPBasicBlock *getBlock() const { return Block; }
87 VPBasicBlock::iterator getPoint() const { return Point; }
88 };
89
90 /// Sets the current insert point to a previously-saved location.
91 void restoreIP(VPInsertPoint IP) {
92 if (IP.isSet())
93 setInsertPoint(IP.getBlock(), IP.getPoint());
94 else
95 clearInsertionPoint();
96 }
97
98 /// This specifies that created VPInstructions should be appended to the end
99 /// of the specified block.
100 void setInsertPoint(VPBasicBlock *TheBB) {
101 assert(TheBB && "Attempting to set a null insert point");
102 BB = TheBB;
103 InsertPt = BB->end();
104 }
105
106 /// This specifies that created instructions should be inserted at the
107 /// specified point.
108 void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
109 BB = TheBB;
110 InsertPt = IP;
111 }
112
113 /// Insert and return the specified instruction.
114 VPInstruction *insert(VPInstruction *I) const {
115 BB->insert(I, InsertPt);
116 return I;
117 }
118
119 /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
120 /// its underlying Instruction.
121 VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
122 Instruction *Inst = nullptr) {
123 VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
124 NewVPInst->setUnderlyingValue(Inst);
125 return NewVPInst;
126 }
127 VPValue *createNaryOp(unsigned Opcode,
128 std::initializer_list<VPValue *> Operands,
129 Instruction *Inst = nullptr) {
130 return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
131 }
132
133 VPValue *createNot(VPValue *Operand) {
134 return createInstruction(VPInstruction::Not, {Operand});
135 }
136
137 VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
138 return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
139 }
140
141 VPValue *createOr(VPValue *LHS, VPValue *RHS) {
142 return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
14. Calling 'VPBuilder::createInstruction'
20. Returned allocated memory
143 }
144
145 //===--------------------------------------------------------------------===//
146 // RAII helpers.
147 //===--------------------------------------------------------------------===//
148
149 /// RAII object that stores the current insertion point and restores it when
150 /// the object is destroyed.
151 class InsertPointGuard {
152 VPBuilder &Builder;
153 VPBasicBlock *Block;
154 VPBasicBlock::iterator Point;
155
156 public:
157 InsertPointGuard(VPBuilder &B)
158 : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
159
160 InsertPointGuard(const InsertPointGuard &) = delete;
161 InsertPointGuard &operator=(const InsertPointGuard &) = delete;
162
163 ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
164 };
165};
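A minimal usage sketch of the builder above (hypothetical helper; the parameters are illustrative, and the types come from VPlan.h as included earlier):

    VPValue *emitNotAnd(VPBuilder &Builder, VPBasicBlock *Block,
                        VPValue *A, VPValue *B) {
      VPBuilder::InsertPointGuard Guard(Builder); // restore the caller's insert point on return
      Builder.setInsertPoint(Block);              // append new VPInstructions to the end of Block
      VPValue *AndAB = Builder.createAnd(A, B);   // allocated and inserted into Block
      return Builder.createNot(AndAB);            // also inserted, so nothing is left unowned
    }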
166
167/// TODO: The following VectorizationFactor was pulled out of
168/// LoopVectorizationCostModel class. LV also deals with
169/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
170/// We need to streamline them.
171
172/// Information about vectorization costs
173struct VectorizationFactor {
174 // Vector width with best cost
175 ElementCount Width;
176 // Cost of the loop with that width
177 unsigned Cost;
178
179 // Width 1 means no vectorization, cost 0 means uncomputed cost.
180 static VectorizationFactor Disabled() {
181 return {ElementCount::getFixed(1), 0};
182 }
183
184 bool operator==(const VectorizationFactor &rhs) const {
185 return Width == rhs.Width && Cost == rhs.Cost;
186 }
187};
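A small sketch of how this struct is typically consumed (hypothetical caller, not from this header):

    // Width 1 with cost 0 is the sentinel returned by Disabled(): the loop stays
    // scalar, and only interleaving may still apply.
    bool isVectorizationDisabled(const VectorizationFactor &VF) {
      return VF == VectorizationFactor::Disabled();
    }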
188
189/// Planner drives the vectorization process after having passed
190/// Legality checks.
191class LoopVectorizationPlanner {
192 /// The loop that we evaluate.
193 Loop *OrigLoop;
194
195 /// Loop Info analysis.
196 LoopInfo *LI;
197
198 /// Target Library Info.
199 const TargetLibraryInfo *TLI;
200
201 /// Target Transform Info.
202 const TargetTransformInfo *TTI;
203
204 /// The legality analysis.
205 LoopVectorizationLegality *Legal;
206
207 /// The profitability analysis.
208 LoopVectorizationCostModel &CM;
209
210 /// The interleaved access analysis.
211 InterleavedAccessInfo &IAI;
212
213 PredicatedScalarEvolution &PSE;
214
215 SmallVector<VPlanPtr, 4> VPlans;
216
217 /// This class is used to enable the VPlan to invoke a method of ILV. This is
218 /// needed until the method is refactored out of ILV and becomes reusable.
219 struct VPCallbackILV : public VPCallback {
220 InnerLoopVectorizer &ILV;
221
222 VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
223
224 Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
225 Value *getOrCreateScalarValue(Value *V,
226 const VPIteration &Instance) override;
227 };
228
229 /// A builder used to construct the current plan.
230 VPBuilder Builder;
231
232 /// The best number of elements of the vector types used in the
233 /// transformed loop. BestVF = None means that vectorization is
234 /// disabled.
235 Optional<ElementCount> BestVF = None;
236 unsigned BestUF = 0;
237
238public:
239 LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
240 const TargetTransformInfo *TTI,
241 LoopVectorizationLegality *Legal,
242 LoopVectorizationCostModel &CM,
243 InterleavedAccessInfo &IAI,
244 PredicatedScalarEvolution &PSE)
245 : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
246 PSE(PSE) {}
247
248 /// Plan how to best vectorize, return the best VF and its cost, or None if
249 /// vectorization and interleaving should be avoided up front.
250 Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
251
252 /// Use the VPlan-native path to plan how to best vectorize, return the best
253 /// VF and its cost.
254 VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
255
256 /// Finalize the best decision and dispose of all other VPlans.
257 void setBestPlan(ElementCount VF, unsigned UF);
258
259 /// Generate the IR code for the body of the vectorized loop according to the
260 /// best selected VPlan.
261 void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
262
263 void printPlans(raw_ostream &O) {
264 for (const auto &Plan : VPlans)
265 O << *Plan;
266 }
267
268 /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
269 /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
270 /// returned value holds for the entire \p Range.
271 static bool
272 getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
273 VFRange &Range);
274
275protected:
276 /// Collect the instructions from the original loop that would be trivially
277 /// dead in the vectorized loop if generated.
278 void collectTriviallyDeadInstructions(
279 SmallPtrSetImpl<Instruction *> &DeadInstructions);
280
281 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
282 /// according to the information gathered by Legal when it checked if it is
283 /// legal to vectorize the loop.
284 void buildVPlans(unsigned MinVF, unsigned MaxVF);
285
286private:
287 /// Build a VPlan according to the information gathered by Legal. \return a
288 /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
289 /// exclusive, possibly decreasing \p Range.End.
290 VPlanPtr buildVPlan(VFRange &Range);
291
292 /// Build a VPlan using VPRecipes according to the information gathered by
293 /// Legal. This method is only used for the legacy inner loop vectorizer.
294 VPlanPtr buildVPlanWithVPRecipes(
295 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
296 SmallPtrSetImpl<Instruction *> &DeadInstructions,
297 const DenseMap<Instruction *, Instruction *> &SinkAfter);
298
299 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
300 /// according to the information gathered by Legal when it checked if it is
301 /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
302 void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
303
304 /// Adjust the recipes for any in-loop reductions. The chain of instructions
305 /// leading from the loop exit instruction to the phi needs to be converted to
306 /// reductions, with one operand being vector and the other being the scalar
307 /// reduction chain.
308 void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
309 VPRecipeBuilder &RecipeBuilder);
310};
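A minimal driver sketch mirroring the calls made from processLoop() earlier in this report (hypothetical; the planner, ILV, DT, and the user hints are assumed to be set up by the caller):

    bool planAndExecute(LoopVectorizationPlanner &LVP, InnerLoopVectorizer &ILV,
                        DominatorTree *DT, ElementCount UserVF, unsigned UserIC) {
      Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
      if (!MaybeVF)
        return false;                     // vectorization and interleaving avoided up front
      unsigned IC = UserIC > 0 ? UserIC : 1;
      LVP.setBestPlan(MaybeVF->Width, IC);
      LVP.executePlan(ILV, DT);           // generate IR for the selected VPlan
      return true;
    }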
311
312} // namespace llvm
313
314#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H