Bug Summary

File: lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 5031, column 60
Division by zero
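
The checker reports this when it can construct a feasible path on which the divisor is zero at the flagged '/' or '%'. The following is a minimal, self-contained sketch of that general pattern with hypothetical names; it is not the actual code at LoopVectorize.cpp line 5031.

#include <cassert>

// Assumed helper for illustration: returns 0 when no cost information exists.
static unsigned getLoopCost(bool HasCostInfo) { return HasCostInfo ? 20 : 0; }

static unsigned pickCount(unsigned Budget, bool HasCostInfo) {
  unsigned LoopCost = getLoopCost(HasCostInfo);
  // On the path where HasCostInfo is false, LoopCost is 0 here, and the
  // division below is the "Division by zero" the checker reports.
  return Budget / LoopCost;
}

int main() {
  assert(pickCount(40, /*HasCostInfo=*/true) == 2); // safe path
  return 0;
}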

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-9/lib/clang/9.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-9~svn361301/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-9~svn361301/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-9~svn361301/build-llvm/include -I /build/llvm-toolchain-snapshot-9~svn361301/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/9.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-9/lib/clang/9.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-9~svn361301/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-9~svn361301=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2019-05-22-063234-15311-1 -x c++ /build/llvm-toolchain-snapshot-9~svn361301/lib/Transforms/Vectorize/LoopVectorize.cpp -faddrsig
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlanHCFGBuilder.h"
60#include "VPlanHCFGTransforms.h"
61#include "VPlanPredicator.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/None.h"
69#include "llvm/ADT/Optional.h"
70#include "llvm/ADT/STLExtras.h"
71#include "llvm/ADT/SetVector.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/MemorySSA.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpander.h"
94#include "llvm/Analysis/ScalarEvolutionExpressions.h"
95#include "llvm/Analysis/TargetLibraryInfo.h"
96#include "llvm/Analysis/TargetTransformInfo.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/LLVMContext.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/Pass.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/MathExtras.h"
133#include "llvm/Support/raw_ostream.h"
134#include "llvm/Transforms/Utils/BasicBlockUtils.h"
135#include "llvm/Transforms/Utils/LoopSimplify.h"
136#include "llvm/Transforms/Utils/LoopUtils.h"
137#include "llvm/Transforms/Utils/LoopVersioning.h"
138#include "llvm/Transforms/Utils/SizeOpts.h"
139#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
140#include <algorithm>
141#include <cassert>
142#include <cstdint>
143#include <cstdlib>
144#include <functional>
145#include <iterator>
146#include <limits>
147#include <memory>
148#include <string>
149#include <tuple>
150#include <utility>
151#include <vector>
152
153using namespace llvm;
154
155#define LV_NAME "loop-vectorize"
156#define DEBUG_TYPE LV_NAME
157
158/// @{
159/// Metadata attribute names
160static const char *const LLVMLoopVectorizeFollowupAll =
161 "llvm.loop.vectorize.followup_all";
162static const char *const LLVMLoopVectorizeFollowupVectorized =
163 "llvm.loop.vectorize.followup_vectorized";
164static const char *const LLVMLoopVectorizeFollowupEpilogue =
165 "llvm.loop.vectorize.followup_epilogue";
166/// @}
167
168STATISTIC(LoopsVectorized, "Number of loops vectorized");
169STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
170
171/// Loops with a known constant trip count below this number are vectorized only
172/// if no scalar iteration overheads are incurred.
173static cl::opt<unsigned> TinyTripCountVectorThreshold(
174 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
175 cl::desc("Loops with a constant trip count that is smaller than this "
176 "value are vectorized only if no scalar iteration overheads "
177 "are incurred."));
178
179static cl::opt<bool> MaximizeBandwidth(
180 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
181 cl::desc("Maximize bandwidth when selecting vectorization factor which "
182 "will be determined by the smallest type in loop."));
183
184static cl::opt<bool> EnableInterleavedMemAccesses(
185 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
186 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
187
188/// An interleave-group may need masking if it resides in a block that needs
189/// predication, or in order to mask away gaps.
190static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
191 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
192 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
193
194/// We don't interleave loops with a known constant trip count below this
195/// number.
196static const unsigned TinyTripCountInterleaveThreshold = 128;
197
198static cl::opt<unsigned> ForceTargetNumScalarRegs(
199 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
200 cl::desc("A flag that overrides the target's number of scalar registers."));
201
202static cl::opt<unsigned> ForceTargetNumVectorRegs(
203 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
204 cl::desc("A flag that overrides the target's number of vector registers."));
205
206static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
207 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
208 cl::desc("A flag that overrides the target's max interleave factor for "
209 "scalar loops."));
210
211static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
212 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
213 cl::desc("A flag that overrides the target's max interleave factor for "
214 "vectorized loops."));
215
216static cl::opt<unsigned> ForceTargetInstructionCost(
217 "force-target-instruction-cost", cl::init(0), cl::Hidden,
218 cl::desc("A flag that overrides the target's expected cost for "
219 "an instruction to a single constant value. Mostly "
220 "useful for getting consistent testing."));
221
222static cl::opt<unsigned> SmallLoopCost(
223 "small-loop-cost", cl::init(20), cl::Hidden,
224 cl::desc(
225 "The cost of a loop that is considered 'small' by the interleaver."));
226
227static cl::opt<bool> LoopVectorizeWithBlockFrequency(
228 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
229 cl::desc("Enable the use of the block frequency analysis to access PGO "
230 "heuristics minimizing code growth in cold regions and being more "
231 "aggressive in hot regions."));
232
233// Runtime interleave loops for load/store throughput.
234static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
235 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
236 cl::desc(
237 "Enable runtime interleaving until load/store ports are saturated"));
238
239/// The number of stores in a loop that are allowed to need predication.
240static cl::opt<unsigned> NumberOfStoresToPredicate(
241 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
242 cl::desc("Max number of stores to be predicated behind an if."));
243
244static cl::opt<bool> EnableIndVarRegisterHeur(
245 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
246 cl::desc("Count the induction variable only once when interleaving"));
247
248static cl::opt<bool> EnableCondStoresVectorization(
249 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
250 cl::desc("Enable if predication of stores during vectorization."));
251
252static cl::opt<unsigned> MaxNestedScalarReductionIC(
253 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
254 cl::desc("The maximum interleave count to use when interleaving a scalar "
255 "reduction in a nested loop."));
256
257cl::opt<bool> EnableVPlanNativePath(
258 "enable-vplan-native-path", cl::init(false), cl::Hidden,
259 cl::desc("Enable VPlan-native vectorization path with "
260 "support for outer loop vectorization."));
261
262// FIXME: Remove this switch once we have divergence analysis. Currently we
263// assume divergent non-backedge branches when this switch is true.
264cl::opt<bool> EnableVPlanPredication(
265 "enable-vplan-predication", cl::init(false), cl::Hidden,
266 cl::desc("Enable VPlan-native vectorization path predicator with "
267 "support for outer loop vectorization."));
268
269// This flag enables the stress testing of the VPlan H-CFG construction in the
270// VPlan-native vectorization path. It must be used in conjunction with
271// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
272// verification of the H-CFGs built.
273static cl::opt<bool> VPlanBuildStressTest(
274 "vplan-build-stress-test", cl::init(false), cl::Hidden,
275 cl::desc(
276 "Build VPlan for every supported loop nest in the function and bail "
277 "out right after the build (stress test the VPlan H-CFG construction "
278 "in the VPlan-native vectorization path)."));
279
280cl::opt<bool> llvm::EnableLoopInterleaving(
281 "interleave-loops", cl::init(true), cl::Hidden,
282 cl::desc("Enable loop interleaving in Loop vectorization passes"));
283cl::opt<bool> llvm::EnableLoopVectorization(
284 "vectorize-loops", cl::init(true), cl::Hidden,
285 cl::desc("Run the Loop vectorization passes"));
286
287/// A helper function for converting Scalar types to vector types.
288/// If the incoming type is void, we return void. If the VF is 1, we return
289/// the scalar type.
290static Type *ToVectorTy(Type *Scalar, unsigned VF) {
291 if (Scalar->isVoidTy() || VF == 1)
292 return Scalar;
293 return VectorType::get(Scalar, VF);
294}
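// Example (illustrative): ToVectorTy(i32, 4) returns <4 x i32>, while
// ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the scalar type itself.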
295
296/// A helper function that returns the type of loaded or stored value.
297static Type *getMemInstValueType(Value *I) {
298 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
299 "Expected Load or Store instruction");
300 if (auto *LI = dyn_cast<LoadInst>(I))
301 return LI->getType();
302 return cast<StoreInst>(I)->getValueOperand()->getType();
303}
304
305/// A helper function that returns true if the given type is irregular. The
306/// type is irregular if its allocated size doesn't equal the store size of an
307/// element of the corresponding vector type at the given vectorization factor.
308static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
309 // Determine if an array of VF elements of type Ty is "bitcast compatible"
310 // with a <VF x Ty> vector.
311 if (VF > 1) {
312 auto *VectorTy = VectorType::get(Ty, VF);
313 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
314 }
315
316 // If the vectorization factor is one, we just check if an array of type Ty
317 // requires padding between elements.
318 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
319}
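// Example (illustrative, assuming a typical DataLayout): i24 is irregular at
// VF = 4 because 4 * getTypeAllocSize(i24) = 4 * 4 = 16 bytes, while
// <4 x i24> stores 4 * 24 = 96 bits = 12 bytes; i32 is regular since
// 4 * 4 bytes matches the 16-byte store size of <4 x i32>.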
320
321/// A helper function that returns the reciprocal of the block probability of
322/// predicated blocks. If we return X, we are assuming the predicated block
323/// will execute once for every X iterations of the loop header.
324///
325/// TODO: We should use actual block probability here, if available. Currently,
326/// we always assume predicated blocks have a 50% chance of executing.
327static unsigned getReciprocalPredBlockProb() { return 2; }
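// Sketch of the assumed use (illustrative names, not a quote of this file):
// the cost attributed to a predicated block is scaled down by this factor,
//   ScaledCost = BlockCost / getReciprocalPredBlockProb(); // 50% execution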
328
329/// A helper function that adds a 'fast' flag to floating-point operations.
330static Value *addFastMathFlag(Value *V) {
331 if (isa<FPMathOperator>(V))
332 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
333 return V;
334}
335
336static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
337 if (isa<FPMathOperator>(V))
338 cast<Instruction>(V)->setFastMathFlags(FMF);
339 return V;
340}
341
342/// A helper function that returns an integer or floating-point constant with
343/// value C.
344static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
345 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
346 : ConstantFP::get(Ty, C);
347}
348
349namespace llvm {
350
351/// InnerLoopVectorizer vectorizes loops which contain only one basic
352/// block to a specified vectorization factor (VF).
353/// This class performs the widening of scalars into vectors, or multiple
354/// scalars. This class also implements the following features:
355/// * It inserts an epilogue loop for handling loops that don't have iteration
356/// counts that are known to be a multiple of the vectorization factor.
357/// * It handles the code generation for reduction variables.
358/// * Scalarization (implementation using scalars) of un-vectorizable
359/// instructions.
360/// InnerLoopVectorizer does not perform any vectorization-legality
361/// checks, and relies on the caller to check for the different legality
362/// aspects. The InnerLoopVectorizer relies on the
363/// LoopVectorizationLegality class to provide information about the induction
364/// and reduction variables that were found to a given vectorization factor.
365class InnerLoopVectorizer {
366public:
367 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
368 LoopInfo *LI, DominatorTree *DT,
369 const TargetLibraryInfo *TLI,
370 const TargetTransformInfo *TTI, AssumptionCache *AC,
371 OptimizationRemarkEmitter *ORE, unsigned VecWidth,
372 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
373 LoopVectorizationCostModel *CM)
374 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
375 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
376 Builder(PSE.getSE()->getContext()),
377 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
378 virtual ~InnerLoopVectorizer() = default;
379
380 /// Create a new empty loop. Unlink the old loop and connect the new one.
381 /// Return the pre-header block of the new loop.
382 BasicBlock *createVectorizedLoopSkeleton();
383
384 /// Widen a single instruction within the innermost loop.
385 void widenInstruction(Instruction &I);
386
387 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
388 void fixVectorizedLoop();
389
390 // Return true if any runtime check is added.
391 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
392
393 /// A type for vectorized values in the new loop. Each value from the
394 /// original loop, when vectorized, is represented by UF vector values in the
395 /// new unrolled loop, where UF is the unroll factor.
396 using VectorParts = SmallVector<Value *, 2>;
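 /// For example, with UF = 2 a vectorized original value is held as two
 /// Value pointers, one per unrolled part (illustrative).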
397
398 /// Vectorize a single PHINode in a block. This method handles the induction
399 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
400 /// arbitrary length vectors.
401 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
402
403 /// A helper function to scalarize a single Instruction in the innermost loop.
404 /// Generates a sequence of scalar instances for each lane between \p MinLane
405 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
406 /// inclusive.
407 void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
408 bool IfPredicateInstr);
409
410 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
411 /// is provided, the integer induction variable will first be truncated to
412 /// the corresponding type.
413 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
414
415 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
416 /// vector or scalar value on-demand if one is not yet available. When
417 /// vectorizing a loop, we visit the definition of an instruction before its
418 /// uses. When visiting the definition, we either vectorize or scalarize the
419 /// instruction, creating an entry for it in the corresponding map. (In some
420 /// cases, such as induction variables, we will create both vector and scalar
421 /// entries.) Then, as we encounter uses of the definition, we derive values
422 /// for each scalar or vector use unless such a value is already available.
423 /// For example, if we scalarize a definition and one of its uses is vector,
424 /// we build the required vector on-demand with an insertelement sequence
425 /// when visiting the use. Otherwise, if the use is scalar, we can use the
426 /// existing scalar definition.
427 ///
428 /// Return a value in the new loop corresponding to \p V from the original
429 /// loop at unroll index \p Part. If the value has already been vectorized,
430 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
431 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
432 /// a new vector value on-demand by inserting the scalar values into a vector
433 /// with an insertelement sequence. If the value has been neither vectorized
434 /// nor scalarized, it must be loop invariant, so we simply broadcast the
435 /// value into a vector.
436 Value *getOrCreateVectorValue(Value *V, unsigned Part);
437
438 /// Return a value in the new loop corresponding to \p V from the original
439 /// loop at unroll and vector indices \p Instance. If the value has been
440 /// vectorized but not scalarized, the necessary extractelement instruction
441 /// will be generated.
442 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
443
444 /// Construct the vector value of a scalarized value \p V one lane at a time.
445 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
446
447 /// Try to vectorize the interleaved access group that \p Instr belongs to,
448 /// optionally masking the vector operations if \p BlockInMask is non-null.
449 void vectorizeInterleaveGroup(Instruction *Instr,
450 VectorParts *BlockInMask = nullptr);
451
452 /// Vectorize Load and Store instructions, optionally masking the vector
453 /// operations if \p BlockInMask is non-null.
454 void vectorizeMemoryInstruction(Instruction *Instr,
455 VectorParts *BlockInMask = nullptr);
456
457 /// Set the debug location in the builder using the debug location in
458 /// the instruction.
459 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
460
461 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
462 void fixNonInductionPHIs(void);
463
464protected:
465 friend class LoopVectorizationPlanner;
466
467 /// A small list of PHINodes.
468 using PhiVector = SmallVector<PHINode *, 4>;
469
470 /// A type for scalarized values in the new loop. Each value from the
471 /// original loop, when scalarized, is represented by UF x VF scalar values
472 /// in the new unrolled loop, where UF is the unroll factor and VF is the
473 /// vectorization factor.
474 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
475
476 /// Set up the values of the IVs correctly when exiting the vector loop.
477 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
478 Value *CountRoundDown, Value *EndValue,
479 BasicBlock *MiddleBlock);
480
481 /// Create a new induction variable inside L.
482 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
483 Value *Step, Instruction *DL);
484
485 /// Handle all cross-iteration phis in the header.
486 void fixCrossIterationPHIs();
487
488 /// Fix a first-order recurrence. This is the second phase of vectorizing
489 /// this phi node.
490 void fixFirstOrderRecurrence(PHINode *Phi);
491
492 /// Fix a reduction cross-iteration phi. This is the second phase of
493 /// vectorizing this phi node.
494 void fixReduction(PHINode *Phi);
495
496 /// The Loop exit block may have single value PHI nodes with some
497 /// incoming value. While vectorizing we only handled real values
498 /// that were defined inside the loop and we should have one value for
499 /// each predecessor of its parent basic block. See PR14725.
500 void fixLCSSAPHIs();
501
502 /// Iteratively sink the scalarized operands of a predicated instruction into
503 /// the block that was created for it.
504 void sinkScalarOperands(Instruction *PredInst);
505
506 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
507 /// represented as.
508 void truncateToMinimalBitwidths();
509
510 /// Insert the new loop to the loop hierarchy and pass manager
511 /// and update the analysis passes.
512 void updateAnalysis();
513
514 /// Create a broadcast instruction. This method generates a broadcast
515 /// instruction (shuffle) for loop invariant values and for the induction
516 /// value. If this is the induction variable then we extend it to N, N+1, ...
517 /// this is needed because each iteration in the loop corresponds to a SIMD
518 /// element.
519 virtual Value *getBroadcastInstrs(Value *V);
520
521 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
522 /// to each vector element of Val. The sequence starts at StartIndex.
523 /// \p Opcode is relevant for FP induction variable.
524 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
525 Instruction::BinaryOps Opcode =
526 Instruction::BinaryOpsEnd);
527
528 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
529 /// variable on which to base the steps, \p Step is the size of the step, and
530 /// \p EntryVal is the value from the original loop that maps to the steps.
531 /// Note that \p EntryVal doesn't have to be an induction variable - it
532 /// can also be a truncate instruction.
533 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
534 const InductionDescriptor &ID);
535
536 /// Create a vector induction phi node based on an existing scalar one. \p
537 /// EntryVal is the value from the original loop that maps to the vector phi
538 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
539 /// truncate instruction, instead of widening the original IV, we widen a
540 /// version of the IV truncated to \p EntryVal's type.
541 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
542 Value *Step, Instruction *EntryVal);
543
544 /// Returns true if an instruction \p I should be scalarized instead of
545 /// vectorized for the chosen vectorization factor.
546 bool shouldScalarizeInstruction(Instruction *I) const;
547
548 /// Returns true if we should generate a scalar version of \p IV.
549 bool needsScalarInduction(Instruction *IV) const;
550
551 /// If there is a cast involved in the induction variable \p ID, which should
552 /// be ignored in the vectorized loop body, this function records the
553 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
554 /// cast. We had already proved that the casted Phi is equal to the uncasted
555 /// Phi in the vectorized loop (under a runtime guard), and therefore
556 /// there is no need to vectorize the cast - the same value can be used in the
557 /// vector loop for both the Phi and the cast.
558 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
559 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
560 ///
561 /// \p EntryVal is the value from the original loop that maps to the vector
562 /// phi node and is used to distinguish what is the IV currently being
563 /// processed - original one (if \p EntryVal is a phi corresponding to the
564 /// original IV) or the "newly-created" one based on the proof mentioned above
565 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
566 /// latter case \p EntryVal is a TruncInst and we must not record anything for
567 /// that IV, but it's error-prone to expect callers of this routine to care
568 /// about that, hence this explicit parameter.
569 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
570 const Instruction *EntryVal,
571 Value *VectorLoopValue,
572 unsigned Part,
573 unsigned Lane = UINT_MAX);
574
575 /// Generate a shuffle sequence that will reverse the vector Vec.
576 virtual Value *reverseVector(Value *Vec);
577
578 /// Returns (and creates if needed) the original loop trip count.
579 Value *getOrCreateTripCount(Loop *NewLoop);
580
581 /// Returns (and creates if needed) the trip count of the widened loop.
582 Value *getOrCreateVectorTripCount(Loop *NewLoop);
583
584 /// Returns a bitcasted value to the requested vector type.
585 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
586 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
587 const DataLayout &DL);
588
589 /// Emit a bypass check to see if the vector trip count is zero, including if
590 /// it overflows.
591 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
592
593 /// Emit a bypass check to see if all of the SCEV assumptions we've
594 /// had to make are correct.
595 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
596
597 /// Emit bypass checks to check any memory assumptions we may have made.
598 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
599
600 /// Compute the transformed value of Index at offset StartValue using step
601 /// StepValue.
602 /// For integer induction, returns StartValue + Index * StepValue.
603 /// For pointer induction, returns StartValue[Index * StepValue].
604 /// FIXME: The newly created binary instructions should contain nsw/nuw
605 /// flags, which can be found from the original scalar operations.
606 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
607 const DataLayout &DL,
608 const InductionDescriptor &ID) const;
609
610 /// Add additional metadata to \p To that was not present on \p Orig.
611 ///
612 /// Currently this is used to add the noalias annotations based on the
613 /// inserted memchecks. Use this for instructions that are *cloned* into the
614 /// vector loop.
615 void addNewMetadata(Instruction *To, const Instruction *Orig);
616
617 /// Add metadata from one instruction to another.
618 ///
619 /// This includes both the original MDs from \p From and additional ones (\see
620 /// addNewMetadata). Use this for *newly created* instructions in the vector
621 /// loop.
622 void addMetadata(Instruction *To, Instruction *From);
623
624 /// Similar to the previous function but it adds the metadata to a
625 /// vector of instructions.
626 void addMetadata(ArrayRef<Value *> To, Instruction *From);
627
628 /// The original loop.
629 Loop *OrigLoop;
630
631 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
632 /// dynamic knowledge to simplify SCEV expressions and converts them to a
633 /// more usable form.
634 PredicatedScalarEvolution &PSE;
635
636 /// Loop Info.
637 LoopInfo *LI;
638
639 /// Dominator Tree.
640 DominatorTree *DT;
641
642 /// Alias Analysis.
643 AliasAnalysis *AA;
644
645 /// Target Library Info.
646 const TargetLibraryInfo *TLI;
647
648 /// Target Transform Info.
649 const TargetTransformInfo *TTI;
650
651 /// Assumption Cache.
652 AssumptionCache *AC;
653
654 /// Interface to emit optimization remarks.
655 OptimizationRemarkEmitter *ORE;
656
657 /// LoopVersioning. It's only set up (non-null) if memchecks were
658 /// used.
659 ///
660 /// This is currently only used to add no-alias metadata based on the
661 /// memchecks. The actual versioning is performed manually.
662 std::unique_ptr<LoopVersioning> LVer;
663
664 /// The vectorization SIMD factor to use. Each vector will have this many
665 /// vector elements.
666 unsigned VF;
667
668 /// The vectorization unroll factor to use. Each scalar is vectorized to this
669 /// many different vector instructions.
670 unsigned UF;
671
672 /// The builder that we use
673 IRBuilder<> Builder;
674
675 // --- Vectorization state ---
676
677 /// The vector-loop preheader.
678 BasicBlock *LoopVectorPreHeader;
679
680 /// The scalar-loop preheader.
681 BasicBlock *LoopScalarPreHeader;
682
683 /// Middle Block between the vector and the scalar.
684 BasicBlock *LoopMiddleBlock;
685
686 /// The ExitBlock of the scalar loop.
687 BasicBlock *LoopExitBlock;
688
689 /// The vector loop body.
690 BasicBlock *LoopVectorBody;
691
692 /// The scalar loop body.
693 BasicBlock *LoopScalarBody;
694
695 /// A list of all bypass blocks. The first block is the entry of the loop.
696 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
697
698 /// The new Induction variable which was added to the new block.
699 PHINode *Induction = nullptr;
700
701 /// The induction variable of the old basic block.
702 PHINode *OldInduction = nullptr;
703
704 /// Maps values from the original loop to their corresponding values in the
705 /// vectorized loop. A key value can map to either vector values, scalar
706 /// values or both kinds of values, depending on whether the key was
707 /// vectorized and scalarized.
708 VectorizerValueMap VectorLoopValueMap;
709
710 /// Store instructions that were predicated.
711 SmallVector<Instruction *, 4> PredicatedInstructions;
712
713 /// Trip count of the original loop.
714 Value *TripCount = nullptr;
715
716 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
717 Value *VectorTripCount = nullptr;
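 // Worked example: with TripCount = 13, VF = 4 and UF = 2, VF * UF = 8, so
 // VectorTripCount = 13 - (13 % 8) = 8; the remaining 5 iterations run in the
 // scalar epilogue loop.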
718
719 /// The legality analysis.
720 LoopVectorizationLegality *Legal;
721
722 /// The profitability analysis.
723 LoopVectorizationCostModel *Cost;
724
725 // Record whether runtime checks are added.
726 bool AddedSafetyChecks = false;
727
728 // Holds the end values for each induction variable. We save the end values
729 // so we can later fix-up the external users of the induction variables.
730 DenseMap<PHINode *, Value *> IVEndValues;
731
732 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
733 // fixed up at the end of vector code generation.
734 SmallVector<PHINode *, 8> OrigPHIsToFix;
735};
736
737class InnerLoopUnroller : public InnerLoopVectorizer {
738public:
739 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
740 LoopInfo *LI, DominatorTree *DT,
741 const TargetLibraryInfo *TLI,
742 const TargetTransformInfo *TTI, AssumptionCache *AC,
743 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
744 LoopVectorizationLegality *LVL,
745 LoopVectorizationCostModel *CM)
746 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
747 UnrollFactor, LVL, CM) {}
748
749private:
750 Value *getBroadcastInstrs(Value *V) override;
751 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
752 Instruction::BinaryOps Opcode =
753 Instruction::BinaryOpsEnd) override;
754 Value *reverseVector(Value *Vec) override;
755};
756
757} // end namespace llvm
758
759/// Look for a meaningful debug location on the instruction or its
760/// operands.
761static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
762 if (!I)
763 return I;
764
765 DebugLoc Empty;
766 if (I->getDebugLoc() != Empty)
767 return I;
768
769 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
770 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
771 if (OpInst->getDebugLoc() != Empty)
772 return OpInst;
773 }
774
775 return I;
776}
777
778void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
779 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
780 const DILocation *DIL = Inst->getDebugLoc();
781 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
782 !isa<DbgInfoIntrinsic>(Inst)) {
783 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
784 if (NewDIL)
785 B.SetCurrentDebugLocation(NewDIL.getValue());
786 else
787 LLVM_DEBUG(dbgs()
788 << "Failed to create new discriminator: "
789 << DIL->getFilename() << " Line: " << DIL->getLine());
790 }
791 else
792 B.SetCurrentDebugLocation(DIL);
793 } else
794 B.SetCurrentDebugLocation(DebugLoc());
795}
796
797#ifndef NDEBUG
798/// \return string containing a file name and a line # for the given loop.
799static std::string getDebugLocString(const Loop *L) {
800 std::string Result;
801 if (L) {
802 raw_string_ostream OS(Result);
803 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
804 LoopDbgLoc.print(OS);
805 else
806 // Just print the module name.
807 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
808 OS.flush();
809 }
810 return Result;
811}
812#endif
813
814void InnerLoopVectorizer::addNewMetadata(Instruction *To,
815 const Instruction *Orig) {
816 // If the loop was versioned with memchecks, add the corresponding no-alias
817 // metadata.
818 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
819 LVer->annotateInstWithNoAlias(To, Orig);
820}
821
822void InnerLoopVectorizer::addMetadata(Instruction *To,
823 Instruction *From) {
824 propagateMetadata(To, From);
825 addNewMetadata(To, From);
826}
827
828void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
829 Instruction *From) {
830 for (Value *V : To) {
831 if (Instruction *I = dyn_cast<Instruction>(V))
832 addMetadata(I, From);
833 }
834}
835
836namespace llvm {
837
838/// LoopVectorizationCostModel - estimates the expected speedups due to
839/// vectorization.
840/// In many cases vectorization is not profitable. This can happen because of
841/// a number of reasons. In this class we mainly attempt to predict the
842/// expected speedup/slowdowns due to the supported instruction set. We use the
843/// TargetTransformInfo to query the different backends for the cost of
844/// different operations.
845class LoopVectorizationCostModel {
846public:
847 LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
848 LoopInfo *LI, LoopVectorizationLegality *Legal,
849 const TargetTransformInfo &TTI,
850 const TargetLibraryInfo *TLI, DemandedBits *DB,
851 AssumptionCache *AC,
852 OptimizationRemarkEmitter *ORE, const Function *F,
853 const LoopVectorizeHints *Hints,
854 InterleavedAccessInfo &IAI)
855 : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
856 AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
857
858 /// \return An upper bound for the vectorization factor, or None if
859 /// vectorization and interleaving should be avoided up front.
860 Optional<unsigned> computeMaxVF(bool OptForSize);
861
862 /// \return The most profitable vectorization factor and the cost of that VF.
863 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
864 /// then this vectorization factor will be selected if vectorization is
865 /// possible.
866 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
867
868 /// Setup cost-based decisions for user vectorization factor.
869 void selectUserVectorizationFactor(unsigned UserVF) {
870 collectUniformsAndScalars(UserVF);
871 collectInstsToScalarize(UserVF);
872 }
873
874 /// \return The size (in bits) of the smallest and widest types in the code
875 /// that needs to be vectorized. We ignore values that remain scalar such as
876 /// 64 bit loop indices.
877 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
878
879 /// \return The desired interleave count.
880 /// If interleave count has been specified by metadata it will be returned.
881 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
882 /// are the selected vectorization factor and the cost of the selected VF.
883 unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
884 unsigned LoopCost);
885
886 /// A memory access instruction may be vectorized in more than one way.
887 /// The form of the instruction after vectorization depends on cost.
888 /// This function takes cost-based decisions for Load/Store instructions
889 /// and collects them in a map. The decisions map is used for building
890 /// the lists of loop-uniform and loop-scalar instructions.
891 /// The calculated cost is saved with the widening decision in order to
892 /// avoid redundant calculations.
893 void setCostBasedWideningDecision(unsigned VF);
894
895 /// A struct that represents some properties of the register usage
896 /// of a loop.
897 struct RegisterUsage {
898 /// Holds the number of loop invariant values that are used in the loop.
899 unsigned LoopInvariantRegs;
900
901 /// Holds the maximum number of concurrent live intervals in the loop.
902 unsigned MaxLocalUsers;
903 };
904
905 /// \return Returns information about the register usages of the loop for the
906 /// given vectorization factors.
907 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
908
909 /// Collect values we want to ignore in the cost model.
910 void collectValuesToIgnore();
911
912 /// \returns The smallest bitwidth each instruction can be represented with.
913 /// The vector equivalents of these instructions should be truncated to this
914 /// type.
915 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
916 return MinBWs;
917 }
918
919 /// \returns True if it is more profitable to scalarize instruction \p I for
920 /// vectorization factor \p VF.
921 bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
922 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
923
924 // Cost model is not run in the VPlan-native path - return conservative
925 // result until this changes.
926 if (EnableVPlanNativePath)
927 return false;
928
929 auto Scalars = InstsToScalarize.find(VF);
930 assert(Scalars != InstsToScalarize.end() &&
931 "VF not yet analyzed for scalarization profitability");
932 return Scalars->second.find(I) != Scalars->second.end();
933 }
934
935 /// Returns true if \p I is known to be uniform after vectorization.
936 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
937 if (VF == 1)
938 return true;
939
940 // Cost model is not run in the VPlan-native path - return conservative
941 // result until this changes.
942 if (EnableVPlanNativePath)
943 return false;
944
945 auto UniformsPerVF = Uniforms.find(VF);
946 assert(UniformsPerVF != Uniforms.end() &&
947 "VF not yet analyzed for uniformity");
948 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
949 }
950
951 /// Returns true if \p I is known to be scalar after vectorization.
952 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
953 if (VF == 1)
954 return true;
955
956 // Cost model is not run in the VPlan-native path - return conservative
957 // result until this changes.
958 if (EnableVPlanNativePath)
959 return false;
960
961 auto ScalarsPerVF = Scalars.find(VF);
962 assert(ScalarsPerVF != Scalars.end() &&
963 "Scalar values are not calculated for VF");
964 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
965 }
966
967 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
968 /// for vectorization factor \p VF.
969 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
970 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
971 !isProfitableToScalarize(I, VF) &&
972 !isScalarAfterVectorization(I, VF);
973 }
974
975 /// Decision that was taken during cost calculation for memory instruction.
976 enum InstWidening {
977 CM_Unknown,
978 CM_Widen, // For consecutive accesses with stride +1.
979 CM_Widen_Reverse, // For consecutive accesses with stride -1.
980 CM_Interleave,
981 CM_GatherScatter,
982 CM_Scalarize
983 };
984
985 /// Save vectorization decision \p W and \p Cost taken by the cost model for
986 /// instruction \p I and vector width \p VF.
987 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
988 unsigned Cost) {
989 assert(VF >= 2 && "Expected VF >=2");
990 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
991 }
992
993 /// Save vectorization decision \p W and \p Cost taken by the cost model for
994 /// interleaving group \p Grp and vector width \p VF.
995 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
996 InstWidening W, unsigned Cost) {
997 assert(VF >= 2 && "Expected VF >=2");
998 /// Broadcast this decision to all instructions inside the group.
999 /// But the cost will be assigned to one instruction only.
1000 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1001 if (auto *I = Grp->getMember(i)) {
1002 if (Grp->getInsertPos() == I)
1003 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1004 else
1005 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1006 }
1007 }
1008 }
1009
1010 /// Return the cost model decision for the given instruction \p I and vector
1011 /// width \p VF. Return CM_Unknown if this instruction did not pass
1012 /// through the cost modeling.
1013 InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1014 assert(VF >= 2 && "Expected VF >=2");
1015
1016 // Cost model is not run in the VPlan-native path - return conservative
1017 // result until this changes.
1018 if (EnableVPlanNativePath)
1019 return CM_GatherScatter;
1020
1021 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1022 auto Itr = WideningDecisions.find(InstOnVF);
1023 if (Itr == WideningDecisions.end())
1024 return CM_Unknown;
1025 return Itr->second.first;
1026 }
1027
1028 /// Return the vectorization cost for the given instruction \p I and vector
1029 /// width \p VF.
1030 unsigned getWideningCost(Instruction *I, unsigned VF) {
1031 assert(VF >= 2 && "Expected VF >=2");
1032 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1033 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1034 "The cost is not calculated");
1035 return WideningDecisions[InstOnVF].second;
1036 }
1037
1038 /// Return True if instruction \p I is an optimizable truncate whose operand
1039 /// is an induction variable. Such a truncate will be removed by adding a new
1040 /// induction variable with the destination type.
1041 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1042 // If the instruction is not a truncate, return false.
1043 auto *Trunc = dyn_cast<TruncInst>(I);
1044 if (!Trunc)
1045 return false;
1046
1047 // Get the source and destination types of the truncate.
1048 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1049 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1050
1051 // If the truncate is free for the given types, return false. Replacing a
1052 // free truncate with an induction variable would add an induction variable
1053 // update instruction to each iteration of the loop. We exclude from this
1054 // check the primary induction variable since it will need an update
1055 // instruction regardless.
1056 Value *Op = Trunc->getOperand(0);
1057 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1058 return false;
1059
1060 // If the truncated value is not an induction variable, return false.
1061 return Legal->isInductionPhi(Op);
1062 }
1063
1064 /// Collects the instructions to scalarize for each predicated instruction in
1065 /// the loop.
1066 void collectInstsToScalarize(unsigned VF);
1067
1068 /// Collect Uniform and Scalar values for the given \p VF.
1069 /// The sets depend on CM decision for Load/Store instructions
1070 /// that may be vectorized as interleave, gather-scatter or scalarized.
1071 void collectUniformsAndScalars(unsigned VF) {
1072 // Do the analysis once.
1073 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1074 return;
1075 setCostBasedWideningDecision(VF);
1076 collectLoopUniforms(VF);
1077 collectLoopScalars(VF);
1078 }
1079
1080 /// Returns true if the target machine supports masked store operation
1081 /// for the given \p DataType and kind of access to \p Ptr.
1082 bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1083 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1084 }
1085
1086 /// Returns true if the target machine supports masked load operation
1087 /// for the given \p DataType and kind of access to \p Ptr.
1088 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1089 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1090 }
1091
1092 /// Returns true if the target machine supports masked scatter operation
1093 /// for the given \p DataType.
1094 bool isLegalMaskedScatter(Type *DataType) {
1095 return TTI.isLegalMaskedScatter(DataType);
1096 }
1097
1098 /// Returns true if the target machine supports masked gather operation
1099 /// for the given \p DataType.
1100 bool isLegalMaskedGather(Type *DataType) {
1101 return TTI.isLegalMaskedGather(DataType);
1102 }
1103
1104 /// Returns true if the target machine can represent \p V as a masked gather
1105 /// or scatter operation.
1106 bool isLegalGatherOrScatter(Value *V) {
1107 bool LI = isa<LoadInst>(V);
1108 bool SI = isa<StoreInst>(V);
1109 if (!LI && !SI)
1110 return false;
1111 auto *Ty = getMemInstValueType(V);
1112 return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1113 }
1114
1115 /// Returns true if \p I is an instruction that will be scalarized with
1116 /// predication. Such instructions include conditional stores and
1117 /// instructions that may divide by zero.
1118 /// If a non-zero VF has been calculated, we check if I will be scalarized
1119 /// with predication for that VF.
1120 bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1121
1122 // Returns true if \p I is an instruction that will be predicated either
1123 // through scalar predication or masked load/store or masked gather/scatter.
1124 // Superset of instructions that return true for isScalarWithPredication.
1125 bool isPredicatedInst(Instruction *I) {
1126 if (!blockNeedsPredication(I->getParent()))
1127 return false;
1128 // Loads and stores that need some form of masked operation are predicated
1129 // instructions.
1130 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1131 return Legal->isMaskRequired(I);
1132 return isScalarWithPredication(I);
1133 }
1134
1135 /// Returns true if \p I is a memory instruction with consecutive memory
1136 /// access that can be widened.
1137 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1138
1139 /// Returns true if \p I is a memory instruction in an interleaved-group
1140 /// of memory accesses that can be vectorized with wide vector loads/stores
1141 /// and shuffles.
1142 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1143
1144 /// Check if \p Instr belongs to any interleaved access group.
1145 bool isAccessInterleaved(Instruction *Instr) {
1146 return InterleaveInfo.isInterleaved(Instr);
1147 }
1148
1149 /// Get the interleaved access group that \p Instr belongs to.
1150 const InterleaveGroup<Instruction> *
1151 getInterleavedAccessGroup(Instruction *Instr) {
1152 return InterleaveInfo.getInterleaveGroup(Instr);
1153 }
1154
1155 /// Returns true if an interleaved group requires a scalar iteration
1156 /// to handle accesses with gaps, and there is nothing preventing us from
1157 /// creating a scalar epilogue.
1158 bool requiresScalarEpilogue() const {
1159 return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1160 }
1161
1162 /// Returns true if a scalar epilogue is not allowed due to optsize.
1163 bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1164
1165 /// Returns true if all loop blocks should be masked to fold tail loop.
1166 bool foldTailByMasking() const { return FoldTailByMasking; }
1167
1168 bool blockNeedsPredication(BasicBlock *BB) {
1169 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1170 }
1171
1172 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1173 /// with factor VF. Return the cost of the instruction, including
1174 /// scalarization overhead if it's needed.
1175 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1176
1177 /// Estimate cost of a call instruction CI if it were vectorized with factor
1178 /// VF. Return the cost of the instruction, including scalarization overhead
1179 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1180 /// scalarized,
1181 /// i.e. either a vector version isn't available, or it is too expensive.
1182 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1183
1184private:
1185 unsigned NumPredStores = 0;
1186
1187 /// \return An upper bound for the vectorization factor, larger than zero.
1188 /// One is returned if vectorization should best be avoided due to cost.
1189 unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1190
1191 /// The vectorization cost is a combination of the cost itself and a boolean
1192 /// indicating whether any of the contributing operations will actually
1193 /// operate on
1194 /// vector values after type legalization in the backend. If this latter value
1195 /// is
1196 /// false, then all operations will be scalarized (i.e. no vectorization has
1197 /// actually taken place).
1198 using VectorizationCostTy = std::pair<unsigned, bool>;
1199
1200 /// Returns the expected execution cost. The unit of the cost does
1201 /// not matter because we use the 'cost' units to compare different
1202 /// vector widths. The cost that is returned is *not* normalized by
1203 /// the factor width.
1204 VectorizationCostTy expectedCost(unsigned VF);
1205
1206 /// Returns the execution time cost of an instruction for a given vector
1207 /// width. Vector width of one means scalar.
1208 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1209
1210 /// The cost-computation logic from getInstructionCost which provides
1211 /// the vector type as an output parameter.
1212 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1213
1214 /// Calculate vectorization cost of memory instruction \p I.
1215 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1216
1217 /// The cost computation for scalarized memory instruction.
1218 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1219
1220 /// The cost computation for interleaving group of memory instructions.
1221 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1222
1223 /// The cost computation for Gather/Scatter instruction.
1224 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1225
1226 /// The cost computation for widening instruction \p I with consecutive
1227 /// memory access.
1228 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1229
1230 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1231 /// Load: scalar load + broadcast.
1232 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1233 /// element)
1234 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1235
1236 /// Estimate the overhead of scalarizing an instruction. This is a
1237 /// convenience wrapper for the type-based getScalarizationOverhead API.
1238 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1239
1240 /// Returns whether the instruction is a load or store and will be emitted
1241 /// as a vector operation.
1242 bool isConsecutiveLoadOrStore(Instruction *I);
1243
1244 /// Returns true if an artificially high cost for emulated masked memrefs
1245 /// should be used.
1246 bool useEmulatedMaskMemRefHack(Instruction *I);
1247
1248 /// Create an analysis remark that explains why vectorization failed
1249 ///
1250 /// \p RemarkName is the identifier for the remark. \return the remark object
1251 /// that can be streamed to.
1252 OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1253 return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1254 RemarkName, TheLoop);
1255 }
1256
1257 /// Map of scalar integer values to the smallest bitwidth they can be legally
1258 /// represented as. The vector equivalents of these values should be truncated
1259 /// to this type.
1260 MapVector<Instruction *, uint64_t> MinBWs;
1261
1262 /// A type representing the costs for instructions if they were to be
1263 /// scalarized rather than vectorized. The entries are Instruction-Cost
1264 /// pairs.
1265 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1266
1267 /// A set containing all BasicBlocks that are known to be present after
1268 /// vectorization as predicated blocks.
1269 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1270
1271 /// Records whether it is allowed to have the original scalar loop execute at
1272 /// least once. This may be needed as a fallback loop in case runtime
1273 /// aliasing/dependence checks fail, or to handle the tail/remainder
1274 /// iterations when the trip count is unknown or not divisible by the VF,
1275 /// or as a peel-loop to handle gaps in interleave-groups.
1276 /// Under optsize and when the trip count is very small we don't allow any
1277 /// iterations to execute in the scalar loop.
1278 bool IsScalarEpilogueAllowed = true;
1279
1280 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1281 bool FoldTailByMasking = false;
1282
1283 /// A map holding scalar costs for different vectorization factors. The
1284 /// presence of a cost for an instruction in the mapping indicates that the
1285 /// instruction will be scalarized when vectorizing with the associated
1286 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1287 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1288
1289 /// Holds the instructions known to be uniform after vectorization.
1290 /// The data is collected per VF.
1291 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1292
1293 /// Holds the instructions known to be scalar after vectorization.
1294 /// The data is collected per VF.
1295 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1296
1297 /// Holds the instructions (address computations) that are forced to be
1298 /// scalarized.
1299 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1300
1301 /// Returns the expected difference in cost from scalarizing the expression
1302 /// feeding a predicated instruction \p PredInst. The instructions to
1303 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1304 /// non-negative return value implies the expression will be scalarized.
1305 /// Currently, only single-use chains are considered for scalarization.
1306 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1307 unsigned VF);
1308
1309 /// Collect the instructions that are uniform after vectorization. An
1310 /// instruction is uniform if we represent it with a single scalar value in
1311 /// the vectorized loop corresponding to each vector iteration. Examples of
1312 /// uniform instructions include pointer operands of consecutive or
1313 /// interleaved memory accesses. Note that although uniformity implies an
1314 /// instruction will be scalar, the reverse is not true. In general, a
1315 /// scalarized instruction will be represented by VF scalar values in the
1316 /// vectorized loop, each corresponding to an iteration of the original
1317 /// scalar loop.
1318 void collectLoopUniforms(unsigned VF);
1319
1320 /// Collect the instructions that are scalar after vectorization. An
1321 /// instruction is scalar if it is known to be uniform or will be scalarized
1322 /// during vectorization. Non-uniform scalarized instructions will be
1323 /// represented by VF values in the vectorized loop, each corresponding to an
1324 /// iteration of the original scalar loop.
1325 void collectLoopScalars(unsigned VF);
1326
1327 /// Keeps cost model vectorization decision and cost for instructions.
1328 /// Right now it is used for memory instructions only.
1329 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1330 std::pair<InstWidening, unsigned>>;
1331
1332 DecisionList WideningDecisions;
1333
1334public:
1335 /// The loop that we evaluate.
1336 Loop *TheLoop;
1337
1338 /// Predicated scalar evolution analysis.
1339 PredicatedScalarEvolution &PSE;
1340
1341 /// Loop Info analysis.
1342 LoopInfo *LI;
1343
1344 /// Vectorization legality.
1345 LoopVectorizationLegality *Legal;
1346
1347 /// Vector target information.
1348 const TargetTransformInfo &TTI;
1349
1350 /// Target Library Info.
1351 const TargetLibraryInfo *TLI;
1352
1353 /// Demanded bits analysis.
1354 DemandedBits *DB;
1355
1356 /// Assumption cache.
1357 AssumptionCache *AC;
1358
1359 /// Interface to emit optimization remarks.
1360 OptimizationRemarkEmitter *ORE;
1361
1362 const Function *TheFunction;
1363
1364 /// Loop Vectorize Hint.
1365 const LoopVectorizeHints *Hints;
1366
1367 /// The interleave access information contains groups of interleaved accesses
1368 /// with the same stride and close to each other.
1369 InterleavedAccessInfo &InterleaveInfo;
1370
1371 /// Values to ignore in the cost model.
1372 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1373
1374 /// Values to ignore in the cost model when VF > 1.
1375 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1376};
1377
1378} // end namespace llvm
1379
1380// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1381// vectorization. The loop needs to be annotated with #pragma omp simd
1382 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
1383// vector length information is not provided, vectorization is not considered
1384// explicit. Interleave hints are not allowed either. These limitations will be
1385// relaxed in the future.
1386 // Please note that we are currently forced to abuse the pragma 'clang
1387// vectorize' semantics. This pragma provides *auto-vectorization hints*
1388// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1389// provides *explicit vectorization hints* (LV can bypass legal checks and
1390// assume that vectorization is legal). However, both hints are implemented
1391// using the same metadata (llvm.loop.vectorize, processed by
1392// LoopVectorizeHints). This will be fixed in the future when the native IR
1393// representation for pragma 'omp simd' is introduced.
1394static bool isExplicitVecOuterLoop(Loop *OuterLp,
1395 OptimizationRemarkEmitter *ORE) {
1396   assert(!OuterLp->empty() && "This is not an outer loop");
1397 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1398
1399 // Only outer loops with an explicit vectorization hint are supported.
1400 // Unannotated outer loops are ignored.
1401 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1402 return false;
1403
1404 Function *Fn = OuterLp->getHeader()->getParent();
1405 if (!Hints.allowVectorization(Fn, OuterLp,
1406 true /*VectorizeOnlyWhenForced*/)) {
1407     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1408 return false;
1409 }
1410
1411 if (Hints.getInterleave() > 1) {
1412 // TODO: Interleave support is future work.
1413     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1414                          "outer loops.\n");
1415 Hints.emitRemarkWithHints();
1416 return false;
1417 }
1418
1419 return true;
1420}
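To make the hint requirements above concrete, here is a sketch of the kind of user-level source this check is designed to accept: the outer loop carries an explicit vector width and no interleave hint. The function and array names are invented for illustration, and the width of 4 is arbitrary.

// Hypothetical user code: the *outer* loop is explicitly annotated with a
// vector length, which is what isExplicitVecOuterLoop looks for.
void rowSums(const float *A, float *Sum, int N, int M) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (int i = 0; i < N; ++i) {   // annotated outer loop
    float S = 0.0f;
    for (int j = 0; j < M; ++j)   // inner loop, handled as part of the nest
      S += A[i * M + j];
    Sum[i] = S;
  }
}

An equivalent annotation using #pragma omp simd simdlen(4) on the outer loop would also satisfy the check, provided the interleave count is left unset.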
1421
1422static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1423 OptimizationRemarkEmitter *ORE,
1424 SmallVectorImpl<Loop *> &V) {
1425 // Collect inner loops and outer loops without irreducible control flow. For
1426 // now, only collect outer loops that have explicit vectorization hints. If we
1427 // are stress testing the VPlan H-CFG construction, we collect the outermost
1428 // loop of every loop nest.
1429 if (L.empty() || VPlanBuildStressTest ||
1430 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1431 LoopBlocksRPO RPOT(&L);
1432 RPOT.perform(LI);
1433 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1434 V.push_back(&L);
1435 // TODO: Collect inner loops inside marked outer loops in case
1436 // vectorization fails for the outer loop. Do not invoke
1437 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1438 // already known to be reducible. We can use an inherited attribute for
1439 // that.
1440 return;
1441 }
1442 }
1443 for (Loop *InnerL : L)
1444 collectSupportedLoops(*InnerL, LI, ORE, V);
1445}
1446
1447namespace {
1448
1449/// The LoopVectorize Pass.
1450struct LoopVectorize : public FunctionPass {
1451 /// Pass identification, replacement for typeid
1452 static char ID;
1453
1454 LoopVectorizePass Impl;
1455
1456 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1457 bool VectorizeOnlyWhenForced = false)
1458 : FunctionPass(ID) {
1459 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1460 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1461 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1462 }
1463
1464 bool runOnFunction(Function &F) override {
1465 if (skipFunction(F))
1466 return false;
1467
1468 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1469 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1470 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1471 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1472 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1473 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1474 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1475 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1476 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1477 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1478 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1479 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1480 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1481
1482 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1483 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1484
1485 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1486 GetLAA, *ORE, PSI);
1487 }
1488
1489 void getAnalysisUsage(AnalysisUsage &AU) const override {
1490 AU.addRequired<AssumptionCacheTracker>();
1491 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1492 AU.addRequired<DominatorTreeWrapperPass>();
1493 AU.addRequired<LoopInfoWrapperPass>();
1494 AU.addRequired<ScalarEvolutionWrapperPass>();
1495 AU.addRequired<TargetTransformInfoWrapperPass>();
1496 AU.addRequired<AAResultsWrapperPass>();
1497 AU.addRequired<LoopAccessLegacyAnalysis>();
1498 AU.addRequired<DemandedBitsWrapperPass>();
1499 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1500
1501 // We currently do not preserve loopinfo/dominator analyses with outer loop
1502 // vectorization. Until this is addressed, mark these analyses as preserved
1503 // only for non-VPlan-native path.
1504 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1505 if (!EnableVPlanNativePath) {
1506 AU.addPreserved<LoopInfoWrapperPass>();
1507 AU.addPreserved<DominatorTreeWrapperPass>();
1508 }
1509
1510 AU.addPreserved<BasicAAWrapperPass>();
1511 AU.addPreserved<GlobalsAAWrapperPass>();
1512 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1513 }
1514};
1515
1516} // end anonymous namespace
1517
1518//===----------------------------------------------------------------------===//
1519// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1520// LoopVectorizationCostModel and LoopVectorizationPlanner.
1521//===----------------------------------------------------------------------===//
1522
1523Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1524 // We need to place the broadcast of invariant variables outside the loop,
1525   // but only if it's proven safe to do so. Otherwise, the broadcast will be
1526   // placed inside the vector loop body.
1527 Instruction *Instr = dyn_cast<Instruction>(V);
1528 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1529 (!Instr ||
1530 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1531 // Place the code for broadcasting invariant variables in the new preheader.
1532 IRBuilder<>::InsertPointGuard Guard(Builder);
1533 if (SafeToHoist)
1534 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1535
1536 // Broadcast the scalar into all locations in the vector.
1537 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1538
1539 return Shuf;
1540}
1541
1542void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1543 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1544   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1545          "Expected either an induction phi-node or a truncate of it!");
1546 Value *Start = II.getStartValue();
1547
1548 // Construct the initial value of the vector IV in the vector loop preheader
1549 auto CurrIP = Builder.saveIP();
1550 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1551 if (isa<TruncInst>(EntryVal)) {
1552     assert(Start->getType()->isIntegerTy() &&
1553            "Truncation requires an integer type");
1554 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1555 Step = Builder.CreateTrunc(Step, TruncType);
1556 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1557 }
1558 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1559 Value *SteppedStart =
1560 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1561
1562 // We create vector phi nodes for both integer and floating-point induction
1563 // variables. Here, we determine the kind of arithmetic we will perform.
1564 Instruction::BinaryOps AddOp;
1565 Instruction::BinaryOps MulOp;
1566 if (Step->getType()->isIntegerTy()) {
1567 AddOp = Instruction::Add;
1568 MulOp = Instruction::Mul;
1569 } else {
1570 AddOp = II.getInductionOpcode();
1571 MulOp = Instruction::FMul;
1572 }
1573
1574 // Multiply the vectorization factor by the step using integer or
1575 // floating-point arithmetic as appropriate.
1576 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1577 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1578
1579 // Create a vector splat to use in the induction update.
1580 //
1581 // FIXME: If the step is non-constant, we create the vector splat with
1582 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1583 // handle a constant vector splat.
1584 Value *SplatVF = isa<Constant>(Mul)
1585 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1586 : Builder.CreateVectorSplat(VF, Mul);
1587 Builder.restoreIP(CurrIP);
1588
1589 // We may need to add the step a number of times, depending on the unroll
1590 // factor. The last of those goes into the PHI.
1591 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1592 &*LoopVectorBody->getFirstInsertionPt());
1593 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1594 Instruction *LastInduction = VecInd;
1595 for (unsigned Part = 0; Part < UF; ++Part) {
1596 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1597
1598 if (isa<TruncInst>(EntryVal))
1599 addMetadata(LastInduction, EntryVal);
1600 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1601
1602 LastInduction = cast<Instruction>(addFastMathFlag(
1603 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1604 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1605 }
1606
1607 // Move the last step to the end of the latch block. This ensures consistent
1608 // placement of all induction updates.
1609 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1610 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1611 auto *ICmp = cast<Instruction>(Br->getCondition());
1612 LastInduction->moveBefore(ICmp);
1613 LastInduction->setName("vec.ind.next");
1614
1615 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1616 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1617}
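As a worked illustration of the recurrence created above (a sketch only, with VF = 4, UF = 2, an integer start of 0 and a step of 1 chosen arbitrarily): the phi starts at <0,1,2,3>, each unroll part is the previous value plus the splat of VF * Step, and the value moved to the latch becomes the phi's back-edge operand. A small standalone model of the lane values per part:

#include <array>
#include <cstdio>

// Scalar model of the vector IV recurrence, assuming an integer induction
// with VF = 4, UF = 2, Start = 0 and Step = 1 (all arbitrary choices).
int main() {
  constexpr int VF = 4, UF = 2, Start = 0, Step = 1;
  std::array<int, VF> Part = {Start, Start + Step, Start + 2 * Step,
                              Start + 3 * Step};        // SteppedStart
  for (int Iter = 0; Iter < 2; ++Iter) {                // two vector iterations
    for (int P = 0; P < UF; ++P) {
      std::printf("iter %d part %d:", Iter, P);
      for (int L : Part)
        std::printf(" %d", L);
      std::printf("\n");
      for (int &L : Part)
        L += VF * Step;                                 // the "step.add"
    }
  }
}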
1618
1619bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1620 return Cost->isScalarAfterVectorization(I, VF) ||
1621 Cost->isProfitableToScalarize(I, VF);
1622}
1623
1624bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1625 if (shouldScalarizeInstruction(IV))
1626 return true;
1627 auto isScalarInst = [&](User *U) -> bool {
1628 auto *I = cast<Instruction>(U);
1629 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1630 };
1631 return llvm::any_of(IV->users(), isScalarInst);
1632}
1633
1634void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1635 const InductionDescriptor &ID, const Instruction *EntryVal,
1636 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1637   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1638          "Expected either an induction phi-node or a truncate of it!");
1639
1640   // This induction variable is not the phi from the original loop but the
1641   // newly-created IV based on the proof that the casted Phi is equal to the
1642   // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1643   // re-uses the same InductionDescriptor that the original IV uses, but we
1644   // don't have to do any recording in this case - that is done when the
1645   // original IV is processed.
1646 if (isa<TruncInst>(EntryVal))
1647 return;
1648
1649 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1650 if (Casts.empty())
1651 return;
1652 // Only the first Cast instruction in the Casts vector is of interest.
1653   // The rest of the Casts (if they exist) have no uses outside the
1654 // induction update chain itself.
1655 Instruction *CastInst = *Casts.begin();
1656   if (Lane < UINT_MAX)
1657 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1658 else
1659 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1660}
1661
1662void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1663   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1664          "Primary induction variable must have an integer type");
1665
1666 auto II = Legal->getInductionVars()->find(IV);
1667   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1668
1669 auto ID = II->second;
1670   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1671
1672 // The scalar value to broadcast. This will be derived from the canonical
1673 // induction variable.
1674 Value *ScalarIV = nullptr;
1675
1676 // The value from the original loop to which we are mapping the new induction
1677 // variable.
1678 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1679
1680 // True if we have vectorized the induction variable.
1681 auto VectorizedIV = false;
1682
1683 // Determine if we want a scalar version of the induction variable. This is
1684 // true if the induction variable itself is not widened, or if it has at
1685 // least one user in the loop that is not widened.
1686 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1687
1688 // Generate code for the induction step. Note that induction steps are
1689   // required to be loop-invariant.
1690   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1691          "Induction step should be loop invariant");
1692 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1693 Value *Step = nullptr;
1694 if (PSE.getSE()->isSCEVable(IV->getType())) {
1695 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1696 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1697 LoopVectorPreHeader->getTerminator());
1698 } else {
1699 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1700 }
1701
1702 // Try to create a new independent vector induction variable. If we can't
1703 // create the phi node, we will splat the scalar induction variable in each
1704 // loop iteration.
1705 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1706 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1707 VectorizedIV = true;
1708 }
1709
1710 // If we haven't yet vectorized the induction variable, or if we will create
1711 // a scalar one, we need to define the scalar induction variable and step
1712 // values. If we were given a truncation type, truncate the canonical
1713 // induction variable and step. Otherwise, derive these values from the
1714 // induction descriptor.
1715 if (!VectorizedIV || NeedsScalarIV) {
1716 ScalarIV = Induction;
1717 if (IV != OldInduction) {
1718 ScalarIV = IV->getType()->isIntegerTy()
1719 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1720 : Builder.CreateCast(Instruction::SIToFP, Induction,
1721 IV->getType());
1722 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1723 ScalarIV->setName("offset.idx");
1724 }
1725 if (Trunc) {
1726 auto *TruncType = cast<IntegerType>(Trunc->getType());
1727       assert(Step->getType()->isIntegerTy() &&
1728              "Truncation requires an integer step");
1729 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1730 Step = Builder.CreateTrunc(Step, TruncType);
1731 }
1732 }
1733
1734 // If we haven't yet vectorized the induction variable, splat the scalar
1735 // induction variable, and build the necessary step vectors.
1736 // TODO: Don't do it unless the vectorized IV is really required.
1737 if (!VectorizedIV) {
1738 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1739 for (unsigned Part = 0; Part < UF; ++Part) {
1740 Value *EntryPart =
1741 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1742 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1743 if (Trunc)
1744 addMetadata(EntryPart, Trunc);
1745 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1746 }
1747 }
1748
1749 // If an induction variable is only used for counting loop iterations or
1750 // calculating addresses, it doesn't need to be widened. Create scalar steps
1751 // that can be used by instructions we will later scalarize. Note that the
1752 // addition of the scalar steps will not increase the number of instructions
1753 // in the loop in the common case prior to InstCombine. We will be trading
1754 // one vector extract for each scalar step.
1755 if (NeedsScalarIV)
1756 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1757}
1758
1759Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1760 Instruction::BinaryOps BinOp) {
1761 // Create and check the types.
1762   assert(Val->getType()->isVectorTy() && "Must be a vector");
1763 int VLen = Val->getType()->getVectorNumElements();
1764
1765 Type *STy = Val->getType()->getScalarType();
1766   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1767          "Induction Step must be an integer or FP");
1768   assert(Step->getType() == STy && "Step has wrong type");
1769
1770 SmallVector<Constant *, 8> Indices;
1771
1772 if (STy->isIntegerTy()) {
1773 // Create a vector of consecutive numbers from zero to VF.
1774 for (int i = 0; i < VLen; ++i)
1775 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1776
1777 // Add the consecutive indices to the vector value.
1778 Constant *Cv = ConstantVector::get(Indices);
1779     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1780 Step = Builder.CreateVectorSplat(VLen, Step);
1781     assert(Step->getType() == Val->getType() && "Invalid step vec");
1782 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1783 // which can be found from the original scalar operations.
1784 Step = Builder.CreateMul(Cv, Step);
1785 return Builder.CreateAdd(Val, Step, "induction");
1786 }
1787
1788 // Floating point induction.
1789   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1790          "Binary Opcode should be specified for FP induction");
1791 // Create a vector of consecutive numbers from zero to VF.
1792 for (int i = 0; i < VLen; ++i)
1793 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1794
1795 // Add the consecutive indices to the vector value.
1796 Constant *Cv = ConstantVector::get(Indices);
1797
1798 Step = Builder.CreateVectorSplat(VLen, Step);
1799
1800 // Floating point operations had to be 'fast' to enable the induction.
1801 FastMathFlags Flags;
1802 Flags.setFast();
1803
1804 Value *MulOp = Builder.CreateFMul(Cv, Step);
1805 if (isa<Instruction>(MulOp))
1806     // We have to check because MulOp may be a constant
1807 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1808
1809 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1810 if (isa<Instruction>(BOp))
1811 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1812 return BOp;
1813}
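For the integer path above, lane i of the result is Val[i] + (StartIdx + i) * Step; the floating-point path is analogous, using fast-math FMul and the induction's FAdd/FSub. A standalone sketch of that arithmetic on plain integers (VLen = 4 chosen arbitrarily; the helper name is invented):

#include <array>

// Model of the integer path of getStepVector: lane i of the result is
// Val[i] + (StartIdx + i) * Step.
std::array<int, 4> stepVector(std::array<int, 4> Val, int StartIdx, int Step) {
  for (int i = 0; i < 4; ++i)
    Val[i] += (StartIdx + i) * Step;
  return Val;
}
// e.g. stepVector({10, 10, 10, 10}, 0, 3) yields {10, 13, 16, 19}.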
1814
1815void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1816 Instruction *EntryVal,
1817 const InductionDescriptor &ID) {
1818 // We shouldn't have to build scalar steps if we aren't vectorizing.
1819   assert(VF > 1 && "VF should be greater than one");
1820
1821 // Get the value type and ensure it and the step have the same integer type.
1822 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1823   assert(ScalarIVTy == Step->getType() &&
1824          "Val and Step should have the same type");
1825
1826 // We build scalar steps for both integer and floating-point induction
1827 // variables. Here, we determine the kind of arithmetic we will perform.
1828 Instruction::BinaryOps AddOp;
1829 Instruction::BinaryOps MulOp;
1830 if (ScalarIVTy->isIntegerTy()) {
1831 AddOp = Instruction::Add;
1832 MulOp = Instruction::Mul;
1833 } else {
1834 AddOp = ID.getInductionOpcode();
1835 MulOp = Instruction::FMul;
1836 }
1837
1838 // Determine the number of scalars we need to generate for each unroll
1839 // iteration. If EntryVal is uniform, we only need to generate the first
1840 // lane. Otherwise, we generate all VF values.
1841 unsigned Lanes =
1842 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1843 : VF;
1844 // Compute the scalar steps and save the results in VectorLoopValueMap.
1845 for (unsigned Part = 0; Part < UF; ++Part) {
1846 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1847 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1848 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1849 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1850 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1851 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1852 }
1853 }
1854}
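In scalar terms, the value recorded above for unroll part Part and lane Lane is ScalarIV + (VF * Part + Lane) * Step, with only lane 0 materialized when EntryVal is uniform. A small standalone model of that formula (VF, UF, and the start/step constants are arbitrary choices, not values from this pass):

#include <cstdio>

// Model of buildScalarSteps for an integer IV: the value stored for
// {Part, Lane} is ScalarIV + (VF * Part + Lane) * Step.
int main() {
  const unsigned VF = 4, UF = 2;
  const int ScalarIV = 100, Step = 2;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("{%u,%u} -> %d\n", Part, Lane,
                  ScalarIV + int(VF * Part + Lane) * Step);
}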
1855
1856Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1857   assert(V != Induction && "The new induction variable should not be used.");
1858   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1859   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1860
1861 // If we have a stride that is replaced by one, do it here. Defer this for
1862 // the VPlan-native path until we start running Legal checks in that path.
1863 if (!EnableVPlanNativePath && Legal->hasStride(V))
1864 V = ConstantInt::get(V->getType(), 1);
1865
1866 // If we have a vector mapped to this value, return it.
1867 if (VectorLoopValueMap.hasVectorValue(V, Part))
1868 return VectorLoopValueMap.getVectorValue(V, Part);
1869
1870 // If the value has not been vectorized, check if it has been scalarized
1871 // instead. If it has been scalarized, and we actually need the value in
1872 // vector form, we will construct the vector values on demand.
1873 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1874 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1875
1876 // If we've scalarized a value, that value should be an instruction.
1877 auto *I = cast<Instruction>(V);
1878
1879 // If we aren't vectorizing, we can just copy the scalar map values over to
1880 // the vector map.
1881 if (VF == 1) {
1882 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1883 return ScalarValue;
1884 }
1885
1886 // Get the last scalar instruction we generated for V and Part. If the value
1887 // is known to be uniform after vectorization, this corresponds to lane zero
1888 // of the Part unroll iteration. Otherwise, the last instruction is the one
1889 // we created for the last vector lane of the Part unroll iteration.
1890 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1891 auto *LastInst = cast<Instruction>(
1892 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1893
1894 // Set the insert point after the last scalarized instruction. This ensures
1895 // the insertelement sequence will directly follow the scalar definitions.
1896 auto OldIP = Builder.saveIP();
1897 auto NewIP = std::next(BasicBlock::iterator(LastInst));
1898 Builder.SetInsertPoint(&*NewIP);
1899
1900 // However, if we are vectorizing, we need to construct the vector values.
1901 // If the value is known to be uniform after vectorization, we can just
1902 // broadcast the scalar value corresponding to lane zero for each unroll
1903 // iteration. Otherwise, we construct the vector values using insertelement
1904 // instructions. Since the resulting vectors are stored in
1905 // VectorLoopValueMap, we will only generate the insertelements once.
1906 Value *VectorValue = nullptr;
1907 if (Cost->isUniformAfterVectorization(I, VF)) {
1908 VectorValue = getBroadcastInstrs(ScalarValue);
1909 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1910 } else {
1911 // Initialize packing with insertelements to start from undef.
1912 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1913 VectorLoopValueMap.setVectorValue(V, Part, Undef);
1914 for (unsigned Lane = 0; Lane < VF; ++Lane)
1915 packScalarIntoVectorValue(V, {Part, Lane});
1916 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1917 }
1918 Builder.restoreIP(OldIP);
1919 return VectorValue;
1920 }
1921
1922 // If this scalar is unknown, assume that it is a constant or that it is
1923 // loop invariant. Broadcast V and save the value for future uses.
1924 Value *B = getBroadcastInstrs(V);
1925 VectorLoopValueMap.setVectorValue(V, Part, B);
1926 return B;
1927}
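The two packing strategies at the end of this function can be modeled in scalar terms: a value that is uniform after vectorization is broadcast from lane 0 of the requested part, while a non-uniform scalarized value is rebuilt lane by lane via the insertelement sequence. A sketch under the assumption of VF = 4 and plain int lanes (the helper name is invented):

#include <array>

// Hypothetical model of the packing step: Scalars holds the VF scalar copies
// generated for one unroll part; the result is the vector form of the value.
std::array<int, 4> packScalars(const std::array<int, 4> &Scalars,
                               bool UniformAfterVectorization) {
  std::array<int, 4> Vec{};
  if (UniformAfterVectorization) {
    Vec.fill(Scalars[0]);          // broadcast of lane zero
  } else {
    for (int Lane = 0; Lane < 4; ++Lane)
      Vec[Lane] = Scalars[Lane];   // one insertelement per lane
  }
  return Vec;
}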
1928
1929Value *
1930InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1931 const VPIteration &Instance) {
1932 // If the value is not an instruction contained in the loop, it should
1933 // already be scalar.
1934 if (OrigLoop->isLoopInvariant(V))
1935 return V;
1936
1937   assert(Instance.Lane > 0
1938              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1939              : true && "Uniform values only have lane zero");
1940
1941 // If the value from the original loop has not been vectorized, it is
1942 // represented by UF x VF scalar values in the new loop. Return the requested
1943 // scalar value.
1944 if (VectorLoopValueMap.hasScalarValue(V, Instance))
1945 return VectorLoopValueMap.getScalarValue(V, Instance);
1946
1947 // If the value has not been scalarized, get its entry in VectorLoopValueMap
1948 // for the given unroll part. If this entry is not a vector type (i.e., the
1949 // vectorization factor is one), there is no need to generate an
1950 // extractelement instruction.
1951 auto *U = getOrCreateVectorValue(V, Instance.Part);
1952 if (!U->getType()->isVectorTy()) {
1953     assert(VF == 1 && "Value not scalarized has non-vector type");
1954 return U;
1955 }
1956
1957 // Otherwise, the value from the original loop has been vectorized and is
1958 // represented by UF vector values. Extract and return the requested scalar
1959 // value from the appropriate vector lane.
1960 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1961}
1962
1963void InnerLoopVectorizer::packScalarIntoVectorValue(
1964 Value *V, const VPIteration &Instance) {
1965   assert(V != Induction && "The new induction variable should not be used.");
1966   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1967   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1968
1969 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1970 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1971 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1972 Builder.getInt32(Instance.Lane));
1973 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1974}
1975
1976Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1977   assert(Vec->getType()->isVectorTy() && "Invalid type");
1978 SmallVector<Constant *, 8> ShuffleMask;
1979 for (unsigned i = 0; i < VF; ++i)
1980 ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1981
1982 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1983 ConstantVector::get(ShuffleMask),
1984 "reverse");
1985}
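The shuffle mask built here simply reverses lane order: result lane i reads input lane VF - 1 - i. A standalone sketch of the same permutation on plain arrays (VF = 4 chosen arbitrarily; not code from this file):

#include <array>

// Hypothetical helper mirroring reverseVector's mask <VF-1, ..., 1, 0>.
std::array<int, 4> reverseLanes(const std::array<int, 4> &Vec) {
  std::array<int, 4> Rev;
  for (int i = 0; i < 4; ++i)
    Rev[i] = Vec[4 - i - 1];
  return Rev;
}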
1986
1987// Return whether we allow using masked interleave-groups (for dealing with
1988// strided loads/stores that reside in predicated blocks, or for dealing
1989// with gaps).
1990static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
1991 // If an override option has been passed in for interleaved accesses, use it.
1992 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1993 return EnableMaskedInterleavedMemAccesses;
1994
1995 return TTI.enableMaskedInterleavedAccessVectorization();
1996}
1997
1998// Try to vectorize the interleave group that \p Instr belongs to.
1999//
2000// E.g. Translate following interleaved load group (factor = 3):
2001// for (i = 0; i < N; i+=3) {
2002// R = Pic[i]; // Member of index 0
2003// G = Pic[i+1]; // Member of index 1
2004// B = Pic[i+2]; // Member of index 2
2005// ... // do something to R, G, B
2006// }
2007// To:
2008// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2009// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2010// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2011// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2012//
2013// Or translate following interleaved store group (factor = 3):
2014// for (i = 0; i < N; i+=3) {
2015// ... do something to R, G, B
2016// Pic[i] = R; // Member of index 0
2017// Pic[i+1] = G; // Member of index 1
2018// Pic[i+2] = B; // Member of index 2
2019// }
2020// To:
2021// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2022// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2023// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2024// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2025// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2026void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2027 VectorParts *BlockInMask) {
2028 const InterleaveGroup<Instruction> *Group =
2029 Cost->getInterleavedAccessGroup(Instr);
2030   assert(Group && "Fail to get an interleaved access group.");
2031
2032 // Skip if current instruction is not the insert position.
2033 if (Instr != Group->getInsertPos())
2034 return;
2035
2036 const DataLayout &DL = Instr->getModule()->getDataLayout();
2037 Value *Ptr = getLoadStorePointerOperand(Instr);
2038
2039 // Prepare for the vector type of the interleaved load/store.
2040 Type *ScalarTy = getMemInstValueType(Instr);
2041 unsigned InterleaveFactor = Group->getFactor();
2042 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2043 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2044
2045 // Prepare for the new pointers.
2046 setDebugLocFromInst(Builder, Ptr);
2047 SmallVector<Value *, 2> NewPtrs;
2048 unsigned Index = Group->getIndex(Instr);
2049
2050 VectorParts Mask;
2051 bool IsMaskForCondRequired = BlockInMask;
2052 if (IsMaskForCondRequired) {
2053 Mask = *BlockInMask;
2054 // TODO: extend the masked interleaved-group support to reversed access.
2055     assert(!Group->isReverse() && "Reversed masked interleave-group "
2056                                   "not supported.");
2057 }
2058
2059 // If the group is reverse, adjust the index to refer to the last vector lane
2060 // instead of the first. We adjust the index from the first vector lane,
2061 // rather than directly getting the pointer for lane VF - 1, because the
2062 // pointer operand of the interleaved access is supposed to be uniform. For
2063 // uniform instructions, we're only required to generate a value for the
2064 // first vector lane in each unroll iteration.
2065 if (Group->isReverse())
2066 Index += (VF - 1) * Group->getFactor();
2067
2068 bool InBounds = false;
2069 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2070 InBounds = gep->isInBounds();
2071
2072 for (unsigned Part = 0; Part < UF; Part++) {
2073 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2074
2075     // Note that the current instruction could be at any member index. We need
2076     // to adjust the address to the member of index 0.
2077 //
2078 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2079 // b = A[i]; // Member of index 0
2080     // The current pointer points to A[i+1]; adjust it to A[i].
2081 //
2082 // E.g. A[i+1] = a; // Member of index 1
2083 // A[i] = b; // Member of index 0
2084 // A[i+2] = c; // Member of index 2 (Current instruction)
2085     // The current pointer points to A[i+2]; adjust it to A[i].
2086 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2087 if (InBounds)
2088 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2089
2090 // Cast to the vector pointer type.
2091 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2092 }
2093
2094 setDebugLocFromInst(Builder, Instr);
2095 Value *UndefVec = UndefValue::get(VecTy);
2096
2097 Value *MaskForGaps = nullptr;
2098 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2099 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2100 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2101 }
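// Illustrative sketch of such a gap mask: for a group with factor 3, members
// present at indices 0 and 1, a gap at index 2, and VF = 4, the mask would
// enable lanes <1,1,0, 1,1,0, 1,1,0, 1,1,0>.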
2102
2103 // Vectorize the interleaved load group.
2104 if (isa<LoadInst>(Instr)) {
2105 // For each unroll part, create a wide load for the group.
2106 SmallVector<Value *, 2> NewLoads;
2107 for (unsigned Part = 0; Part < UF; Part++) {
2108 Instruction *NewLoad;
2109 if (IsMaskForCondRequired || MaskForGaps) {
2110 assert(useMaskedInterleavedAccesses(*TTI) &&
2111        "masked interleaved groups are not allowed.");
2112 Value *GroupMask = MaskForGaps;
2113 if (IsMaskForCondRequired) {
2114 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2115 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2116 Value *ShuffledMask = Builder.CreateShuffleVector(
2117 Mask[Part], Undefs, RepMask, "interleaved.mask");
2118 GroupMask = MaskForGaps
2119 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2120 MaskForGaps)
2121 : ShuffledMask;
2122 }
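// Sketch of the replication above (illustrative values): for VF = 4 and an
// interleave factor of 2, a per-lane mask <m0,m1,m2,m3> is widened to
// <m0,m0,m1,m1,m2,m2,m3,m3>, so every member of a lane's group shares that
// lane's condition bit.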
2123 NewLoad =
2124 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2125 GroupMask, UndefVec, "wide.masked.vec");
2126 }
2127 else
2128 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2129 Group->getAlignment(), "wide.vec");
2130 Group->addMetadata(NewLoad);
2131 NewLoads.push_back(NewLoad);
2132 }
2133
2134 // For each member in the group, shuffle out the appropriate data from the
2135 // wide loads.
2136 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2137 Instruction *Member = Group->getMember(I);
2138
2139 // Skip the gaps in the group.
2140 if (!Member)
2141 continue;
2142
2143 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
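// Illustrative stride masks: for an interleave factor of 3 and VF = 4,
// member 0 uses <0,3,6,9> and member 1 uses <1,4,7,10>, extracting e.g. the
// R and G values from the 12-wide vector in the example at the top.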
2144 for (unsigned Part = 0; Part < UF; Part++) {
2145 Value *StridedVec = Builder.CreateShuffleVector(
2146 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2147
2148 // If this member has a different type, cast the result to that type.
2149 if (Member->getType() != ScalarTy) {
2150 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2151 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2152 }
2153
2154 if (Group->isReverse())
2155 StridedVec = reverseVector(StridedVec);
2156
2157 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2158 }
2159 }
2160 return;
2161 }
2162
2163 // The sub vector type for current instruction.
2164 VectorType *SubVT = VectorType::get(ScalarTy, VF);
2165
2166 // Vectorize the interleaved store group.
2167 for (unsigned Part = 0; Part < UF; Part++) {
2168 // Collect the stored vector from each member.
2169 SmallVector<Value *, 4> StoredVecs;
2170 for (unsigned i = 0; i < InterleaveFactor; i++) {
2171 // An interleaved store group doesn't allow a gap, so each index has a member.
2172 Instruction *Member = Group->getMember(i);
2173 assert(Member && "Fail to get a member from an interleaved store group");
2174
2175 Value *StoredVec = getOrCreateVectorValue(
2176 cast<StoreInst>(Member)->getValueOperand(), Part);
2177 if (Group->isReverse())
2178 StoredVec = reverseVector(StoredVec);
2179
2180 // If this member has a different type, cast it to the unified type.
2181
2182 if (StoredVec->getType() != SubVT)
2183 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2184
2185 StoredVecs.push_back(StoredVec);
2186 }
2187
2188 // Concatenate all vectors into a wide vector.
2189 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2190
2191 // Interleave the elements in the wide vector.
2192 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2193 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2194 "interleaved.vec");
2195
2196 Instruction *NewStoreInstr;
2197 if (IsMaskForCondRequired) {
2198 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2199 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2200 Value *ShuffledMask = Builder.CreateShuffleVector(
2201 Mask[Part], Undefs, RepMask, "interleaved.mask");
2202 NewStoreInstr = Builder.CreateMaskedStore(
2203 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2204 }
2205 else
2206 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2207 Group->getAlignment());
2208
2209 Group->addMetadata(NewStoreInstr);
2210 }
2211}
2212
2213void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2214 VectorParts *BlockInMask) {
2215 // Attempt to issue a wide load.
2216 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2217 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2218
2219 assert((LI || SI) && "Invalid Load/Store instruction");
2220
2221 LoopVectorizationCostModel::InstWidening Decision =
2222 Cost->getWideningDecision(Instr, VF);
2223 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2224        "CM decision should be taken at this point");
2225 if (Decision == LoopVectorizationCostModel::CM_Interleave)
2226 return vectorizeInterleaveGroup(Instr);
2227
2228 Type *ScalarDataTy = getMemInstValueType(Instr);
2229 Type *DataTy = VectorType::get(ScalarDataTy, VF);
2230 Value *Ptr = getLoadStorePointerOperand(Instr);
2231 unsigned Alignment = getLoadStoreAlignment(Instr);
2232 // An alignment of 0 means target ABI alignment. We need to use the scalar's
2233 // target ABI alignment in such a case.
2234 const DataLayout &DL = Instr->getModule()->getDataLayout();
2235 if (!Alignment)
2236 Alignment = DL.getABITypeAlignment(ScalarDataTy);
2237 unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2238
2239 // Determine if the pointer operand of the access is either consecutive or
2240 // reverse consecutive.
2241 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2242 bool ConsecutiveStride =
2243 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2244 bool CreateGatherScatter =
2245 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2246
2247 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2248 // gather/scatter. Otherwise Decision should have been to Scalarize.
2249 assert((ConsecutiveStride || CreateGatherScatter) &&
2250        "The instruction should be scalarized");
2251
2252 // Handle consecutive loads/stores.
2253 if (ConsecutiveStride)
2254 Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2255
2256 VectorParts Mask;
2257 bool isMaskRequired = BlockInMask;
2258 if (isMaskRequired)
2259 Mask = *BlockInMask;
2260
2261 bool InBounds = false;
2262 if (auto *gep = dyn_cast<GetElementPtrInst>(
2263 getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2264 InBounds = gep->isInBounds();
2265
2266 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2267 // Calculate the pointer for the specific unroll-part.
2268 GetElementPtrInst *PartPtr = nullptr;
2269
2270 if (Reverse) {
2271 // If the address is consecutive but reversed, then the
2272 // wide store needs to start at the last vector element.
2273 PartPtr = cast<GetElementPtrInst>(
2274 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2275 PartPtr->setIsInBounds(InBounds);
2276 PartPtr = cast<GetElementPtrInst>(
2277 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2278 PartPtr->setIsInBounds(InBounds);
2279 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2280 Mask[Part] = reverseVector(Mask[Part]);
2281 } else {
2282 PartPtr = cast<GetElementPtrInst>(
2283 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2284 PartPtr->setIsInBounds(InBounds);
2285 }
2286
2287 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2288 };
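// Worked example of the reverse case (illustrative values): with VF = 4,
// Part 0 ends up at Ptr[-3] and loads elements -3..0, Part 1 at Ptr[-7]
// covering elements -7..-4, so after reverseVector the lanes see strictly
// decreasing addresses across parts.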
2289
2290 // Handle Stores:
2291 if (SI) {
2292 setDebugLocFromInst(Builder, SI);
2293
2294 for (unsigned Part = 0; Part < UF; ++Part) {
2295 Instruction *NewSI = nullptr;
2296 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2297 if (CreateGatherScatter) {
2298 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2299 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2300 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2301 MaskPart);
2302 } else {
2303 if (Reverse) {
2304 // If we store to reverse consecutive memory locations, then we need
2305 // to reverse the order of elements in the stored value.
2306 StoredVal = reverseVector(StoredVal);
2307 // We don't want to update the value in the map as it might be used in
2308 // another expression. So don't call resetVectorValue(StoredVal).
2309 }
2310 auto *VecPtr = CreateVecPtr(Part, Ptr);
2311 if (isMaskRequired)
2312 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2313 Mask[Part]);
2314 else
2315 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2316 }
2317 addMetadata(NewSI, SI);
2318 }
2319 return;
2320 }
2321
2322 // Handle loads.
2323 assert(LI && "Must have a load instruction");
2324 setDebugLocFromInst(Builder, LI);
2325 for (unsigned Part = 0; Part < UF; ++Part) {
2326 Value *NewLI;
2327 if (CreateGatherScatter) {
2328 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2329 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2330 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2331 nullptr, "wide.masked.gather");
2332 addMetadata(NewLI, LI);
2333 } else {
2334 auto *VecPtr = CreateVecPtr(Part, Ptr);
2335 if (isMaskRequired)
2336 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2337 UndefValue::get(DataTy),
2338 "wide.masked.load");
2339 else
2340 NewLI =
2341 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2342
2343 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2344 addMetadata(NewLI, LI);
2345 if (Reverse)
2346 NewLI = reverseVector(NewLI);
2347 }
2348 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2349 }
2350}
2351
2352void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2353 const VPIteration &Instance,
2354 bool IfPredicateInstr) {
2355 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2356
2357 setDebugLocFromInst(Builder, Instr);
2358
2359 // Does this instruction return a value?
2360 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2361
2362 Instruction *Cloned = Instr->clone();
2363 if (!IsVoidRetTy)
2364 Cloned->setName(Instr->getName() + ".cloned");
2365
2366 // Replace the operands of the cloned instructions with their scalar
2367 // equivalents in the new loop.
2368 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2369 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2370 Cloned->setOperand(op, NewOp);
2371 }
2372 addNewMetadata(Cloned, Instr);
2373
2374 // Place the cloned scalar in the new loop.
2375 Builder.Insert(Cloned);
2376
2377 // Add the cloned scalar to the scalar map entry.
2378 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2379
2380 // If we just cloned a new assumption, add it the assumption cache.
2381 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2382 if (II->getIntrinsicID() == Intrinsic::assume)
2383 AC->registerAssumption(II);
2384
2385 // End if-block.
2386 if (IfPredicateInstr)
2387 PredicatedInstructions.push_back(Cloned);
2388}
2389
2390PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2391 Value *End, Value *Step,
2392 Instruction *DL) {
2393 BasicBlock *Header = L->getHeader();
2394 BasicBlock *Latch = L->getLoopLatch();
2395 // As we're just creating this loop, it's possible no latch exists
2396 // yet. If so, use the header as this will be a single block loop.
2397 if (!Latch)
2398 Latch = Header;
2399
2400 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2401 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2402 setDebugLocFromInst(Builder, OldInst);
2403 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2404
2405 Builder.SetInsertPoint(Latch->getTerminator());
2406 setDebugLocFromInst(Builder, OldInst);
2407
2408 // Create i+1 and fill the PHINode.
2409 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2410 Induction->addIncoming(Start, L->getLoopPreheader());
2411 Induction->addIncoming(Next, Latch);
2412 // Create the compare.
2413 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2414 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2415
2416 // Now we have two terminators. Remove the old one from the block.
2417 Latch->getTerminator()->eraseFromParent();
2418
2419 return Induction;
2420}
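// A rough sketch of the IR this produces (names and types are illustrative):
//   vector.body:
//     %index = phi i64 [ %start, %preheader ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add i64 %index, %step
//     %cmp = icmp eq i64 %index.next, %end
//     br i1 %cmp, label %exit, label %vector.body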
2421
2422Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2423 if (TripCount)
2424 return TripCount;
2425
2426 assert(L && "Create Trip Count for null loop.");
2427 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2428 // Find the loop boundaries.
2429 ScalarEvolution *SE = PSE.getSE();
2430 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2431 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2432        "Invalid loop count");
2433
2434 Type *IdxTy = Legal->getWidestInductionType();
2435 assert(IdxTy && "No type for induction");
2436
2437 // The exit count might have the type of i64 while the phi is i32. This can
2438 // happen if we have an induction variable that is sign extended before the
2439 // compare. The only way that we get a backedge taken count is that the
2440 // induction variable was signed and as such will not overflow. In such a case
2441 // truncation is legal.
2442 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2443 IdxTy->getPrimitiveSizeInBits())
2444 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2445 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2446
2447 // Get the total trip count from the count by adding 1.
2448 const SCEV *ExitCount = SE->getAddExpr(
2449 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
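// For example, a loop "for (i = 0; i < n; ++i)" with n > 0 has a
// backedge-taken count of n - 1, so the trip count computed here is n.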
2450
2451 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2452
2453 // Expand the trip count and place the new instructions in the preheader.
2454 // Notice that the pre-header does not change, only the loop body.
2455 SCEVExpander Exp(*SE, DL, "induction");
2456
2457 // Count holds the overall loop count (N).
2458 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2459 L->getLoopPreheader()->getTerminator());
2460
2461 if (TripCount->getType()->isPointerTy())
2462 TripCount =
2463 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2464 L->getLoopPreheader()->getTerminator());
2465
2466 return TripCount;
2467}
2468
2469Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2470 if (VectorTripCount)
2471 return VectorTripCount;
2472
2473 Value *TC = getOrCreateTripCount(L);
2474 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2475
2476 Type *Ty = TC->getType();
2477 Constant *Step = ConstantInt::get(Ty, VF * UF);
2478
2479 // If the tail is to be folded by masking, round the number of iterations N
2480 // up to a multiple of Step instead of rounding down. This is done by first
2481 // adding Step-1 and then rounding down. Note that it's ok if this addition
2482 // overflows: the vector induction variable will eventually wrap to zero given
2483 // that it starts at zero and its Step is a power of two; the loop will then
2484 // exit, with the last early-exit vector comparison also producing all-true.
2485 if (Cost->foldTailByMasking()) {
2486 assert(isPowerOf2_32(VF * UF) &&
2487        "VF*UF must be a power of 2 when folding tail by masking");
2488 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2489 }
2490
2491 // Now we need to generate the expression for the part of the loop that the
2492 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2493 // iterations are not required for correctness, or N - Step, otherwise. Step
2494 // is equal to the vectorization factor (number of SIMD elements) times the
2495 // unroll factor (number of SIMD instructions).
2496 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2497
2498 // If there is a non-reversed interleaved group that may speculatively access
2499 // memory out-of-bounds, we need to ensure that there will be at least one
2500 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2501 // the trip count, we set the remainder to be equal to the step. If the step
2502 // does not evenly divide the trip count, no adjustment is necessary since
2503 // there will already be scalar iterations. Note that the minimum iterations
2504 // check ensures that N >= Step.
2505 if (VF > 1 && Cost->requiresScalarEpilogue()) {
2506 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2507 R = Builder.CreateSelect(IsZero, Step, R);
2508 }
2509
2510 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
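// Worked example (illustrative values, Step = VF * UF = 4):
//  - no tail folding, no scalar epilogue required: TC = 10 -> R = 2, n.vec = 8
//  - tail folded: TC = 10 is rounded up to 13 -> R = 1, n.vec = 12
//  - scalar epilogue required: TC = 8 -> R = 0 is bumped to 4, n.vec = 4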
2511
2512 return VectorTripCount;
2513}
2514
2515Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2516 const DataLayout &DL) {
2517 // Verify that V is a vector type with same number of elements as DstVTy.
2518 unsigned VF = DstVTy->getNumElements();
2519 VectorType *SrcVecTy = cast<VectorType>(V->getType());
2520 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2521 Type *SrcElemTy = SrcVecTy->getElementType();
2522 Type *DstElemTy = DstVTy->getElementType();
2523 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2524        "Vector elements must have same size");
2525
2526 // Do a direct cast if element types are castable.
2527 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2528 return Builder.CreateBitOrPointerCast(V, DstVTy);
2529 }
2530 // V cannot be directly casted to desired vector type.
2531 // May happen when V is a floating point vector but DstVTy is a vector of
2532 // pointers or vice-versa. Handle this using a two-step bitcast using an
2533 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2534 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2535        "Only one type should be a pointer type");
2536 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2537        "Only one type should be a floating point type");
2538 Type *IntTy =
2539 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2540 VectorType *VecIntTy = VectorType::get(IntTy, VF);
2541 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2542 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2543}
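// For example (assuming 64-bit pointers): casting <4 x double> to <4 x i8*>
// cannot be done with a single bitcast, so it is lowered in two steps,
// <4 x double> -> <4 x i64> -> <4 x i8*>.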
2544
2545void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2546 BasicBlock *Bypass) {
2547 Value *Count = getOrCreateTripCount(L);
2548 BasicBlock *BB = L->getLoopPreheader();
2549 IRBuilder<> Builder(BB->getTerminator());
2550
2551 // Generate code to check if the loop's trip count is less than VF * UF, or
2552 // equal to it in case a scalar epilogue is required; this implies that the
2553 // vector trip count is zero. This check also covers the case where adding one
2554 // to the backedge-taken count overflowed leading to an incorrect trip count
2555 // of zero. In this case we will also jump to the scalar loop.
2556 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2557 : ICmpInst::ICMP_ULT;
2558
2559 // If tail is to be folded, vector loop takes care of all iterations.
2560 Value *CheckMinIters = Builder.getFalse();
2561 if (!Cost->foldTailByMasking())
2562 CheckMinIters = Builder.CreateICmp(
2563 P, Count, ConstantInt::get(Count->getType(), VF * UF),
2564 "min.iters.check");
2565
2566 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2567 // Update dominator tree immediately if the generated block is a
2568 // LoopBypassBlock because SCEV expansions to generate loop bypass
2569 // checks may query it before the current function is finished.
2570 DT->addNewBlock(NewBB, BB);
2571 if (L->getParentLoop())
2572 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2573 ReplaceInstWithInst(BB->getTerminator(),
2574 BranchInst::Create(Bypass, NewBB, CheckMinIters));
2575 LoopBypassBlocks.push_back(BB);
2576}
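// For example, with VF * UF = 8 and no tail folding: a trip count of 7 always
// bypasses the vector loop (ULT); if a scalar epilogue is required, a trip
// count of exactly 8 bypasses as well (ULE), since no iterations would be
// left over for the epilogue.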
2577
2578void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2579 BasicBlock *BB = L->getLoopPreheader();
2580
2581 // Generate the code to check the SCEV assumptions that we made.
2582 // We want the new basic block to start at the first instruction in a
2583 // sequence of instructions that form a check.
2584 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2585 "scev.check");
2586 Value *SCEVCheck =
2587 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2588
2589 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2590 if (C->isZero())
2591 return;
2592
2593 assert(!Cost->foldTailByMasking() &&
2594        "Cannot SCEV check stride or overflow when folding tail");
2595 // Create a new block containing the stride check.
2596 BB->setName("vector.scevcheck");
2597 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2598 // Update dominator tree immediately if the generated block is a
2599 // LoopBypassBlock because SCEV expansions to generate loop bypass
2600 // checks may query it before the current function is finished.
2601 DT->addNewBlock(NewBB, BB);
2602 if (L->getParentLoop())
2603 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2604 ReplaceInstWithInst(BB->getTerminator(),
2605 BranchInst::Create(Bypass, NewBB, SCEVCheck));
2606 LoopBypassBlocks.push_back(BB);
2607 AddedSafetyChecks = true;
2608}
2609
2610void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2611 // VPlan-native path does not do any analysis for runtime checks currently.
2612 if (EnableVPlanNativePath)
2613 return;
2614
2615 BasicBlock *BB = L->getLoopPreheader();
2616
2617 // Generate the code that checks in runtime if arrays overlap. We put the
2618 // checks into a separate block to make the more common case of few elements
2619 // faster.
2620 Instruction *FirstCheckInst;
2621 Instruction *MemRuntimeCheck;
2622 std::tie(FirstCheckInst, MemRuntimeCheck) =
2623 Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2624 if (!MemRuntimeCheck)
2625 return;
2626
2627 assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2628 // Create a new block containing the memory check.
2629 BB->setName("vector.memcheck");
2630 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2631 // Update dominator tree immediately if the generated block is a
2632 // LoopBypassBlock because SCEV expansions to generate loop bypass
2633 // checks may query it before the current function is finished.
2634 DT->addNewBlock(NewBB, BB);
2635 if (L->getParentLoop())
2636 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2637 ReplaceInstWithInst(BB->getTerminator(),
2638 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2639 LoopBypassBlocks.push_back(BB);
2640 AddedSafetyChecks = true;
2641
2642 // We currently don't use LoopVersioning for the actual loop cloning but we
2643 // still use it to add the noalias metadata.
2644 LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2645 PSE.getSE());
2646 LVer->prepareNoAliasMetadata();
2647}
2648
2649Value *InnerLoopVectorizer::emitTransformedIndex(
2650 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2651 const InductionDescriptor &ID) const {
2652
2653 SCEVExpander Exp(*SE, DL, "induction");
2654 auto Step = ID.getStep();
2655 auto StartValue = ID.getStartValue();
2656 assert(Index->getType() == Step->getType() &&
2657        "Index type does not match StepValue type");
2658
2659 // Note: the IR at this point is broken. We cannot use SE to create any new
2660 // SCEV and then expand it, hoping that SCEV's simplification will give us
2661 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2662 // lead to various SCEV crashes. So all we can do is to use builder and rely
2663 // on InstCombine for future simplifications. Here we handle some trivial
2664 // cases only.
2665 auto CreateAdd = [&B](Value *X, Value *Y) {
2666 assert(X->getType() == Y->getType() && "Types don't match!");
2667 if (auto *CX = dyn_cast<ConstantInt>(X))
2668 if (CX->isZero())
2669 return Y;
2670 if (auto *CY = dyn_cast<ConstantInt>(Y))
2671 if (CY->isZero())
2672 return X;
2673 return B.CreateAdd(X, Y);
2674 };
2675
2676 auto CreateMul = [&B](Value *X, Value *Y) {
2677 assert(X->getType() == Y->getType() && "Types don't match!");
2678 if (auto *CX = dyn_cast<ConstantInt>(X))
2679 if (CX->isOne())
2680 return Y;
2681 if (auto *CY = dyn_cast<ConstantInt>(Y))
2682 if (CY->isOne())
2683 return X;
2684 return B.CreateMul(X, Y);
2685 };
2686
2687 switch (ID.getKind()) {
2688 case InductionDescriptor::IK_IntInduction: {
2689 assert(Index->getType() == StartValue->getType() &&
2690        "Index type does not match StartValue type");
2691 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2692 return B.CreateSub(StartValue, Index);
2693 auto *Offset = CreateMul(
2694 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2695 return CreateAdd(StartValue, Offset);
2696 }
2697 case InductionDescriptor::IK_PtrInduction: {
2698 assert(isa<SCEVConstant>(Step) &&
2699        "Expected constant step for pointer induction");
2700 return B.CreateGEP(
2701 StartValue->getType()->getPointerElementType(), StartValue,
2702 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2703 &*B.GetInsertPoint())));
2704 }
2705 case InductionDescriptor::IK_FpInduction: {
2706 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2707 auto InductionBinOp = ID.getInductionBinOp();
2708 assert(InductionBinOp &&
2709        (InductionBinOp->getOpcode() == Instruction::FAdd ||
2710         InductionBinOp->getOpcode() == Instruction::FSub) &&
2711        "Original bin op should be defined for FP induction");
2712
2713 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2714
2715 // Floating point operations had to be 'fast' to enable the induction.
2716 FastMathFlags Flags;
2717 Flags.setFast();
2718
2719 Value *MulExp = B.CreateFMul(StepValue, Index);
2720 if (isa<Instruction>(MulExp))
2721 // We have to check, the MulExp may be a constant.
2722 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2723
2724 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2725 "induction");
2726 if (isa<Instruction>(BOp))
2727 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2728
2729 return BOp;
2730 }
2731 case InductionDescriptor::IK_NoInduction:
2732 return nullptr;
2733 }
2734 llvm_unreachable("invalid enum");
2735}
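// For the integer-induction case above, the transformed index is
// StartValue + Index * Step; e.g. (illustrative values) StartValue = 100,
// Step = 3, Index = 5 yields 115, and a constant step of -1 folds to
// StartValue - Index.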
2736
2737BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2738 /*
2739 In this function we generate a new loop. The new loop will contain
2740 the vectorized instructions while the old loop will continue to run the
2741 scalar remainder.
2742
2743 [ ] <-- loop iteration number check.
2744 / |
2745 / v
2746 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2747 | / |
2748 | / v
2749 || [ ] <-- vector pre header.
2750 |/ |
2751 | v
2752 | [ ] \
2753 | [ ]_| <-- vector loop.
2754 | |
2755 | v
2756 | -[ ] <--- middle-block.
2757 | / |
2758 | / v
2759 -|- >[ ] <--- new preheader.
2760 | |
2761 | v
2762 | [ ] \
2763 | [ ]_| <-- old scalar loop to handle remainder.
2764 \ |
2765 \ v
2766 >[ ] <-- exit block.
2767 ...
2768 */
2769
2770 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2771 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2772 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2773 MDNode *OrigLoopID = OrigLoop->getLoopID();
2774 assert(VectorPH && "Invalid loop structure");
2775 assert(ExitBlock && "Must have an exit block");
2776
2777 // Some loops have a single integer induction variable, while other loops
2778 // don't. One example is c++ iterators that often have multiple pointer
2779 // induction variables. In the code below we also support a case where we
2780 // don't have a single induction variable.
2781 //
2782 // We try to obtain an induction variable from the original loop as hard
2783 // as possible. However if we don't find one that:
2784 // - is an integer
2785 // - counts from zero, stepping by one
2786 // - is the size of the widest induction variable type
2787 // then we create a new one.
2788 OldInduction = Legal->getPrimaryInduction();
2789 Type *IdxTy = Legal->getWidestInductionType();
2790
2791 // Split the single block loop into the two loop structure described above.
2792 BasicBlock *VecBody =
2793 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2794 BasicBlock *MiddleBlock =
2795 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2796 BasicBlock *ScalarPH =
2797 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2798
2799 // Create and register the new vector loop.
2800 Loop *Lp = LI->AllocateLoop();
2801 Loop *ParentLoop = OrigLoop->getParentLoop();
2802
2803 // Insert the new loop into the loop nest and register the new basic blocks
2804 // before calling any utilities such as SCEV that require valid LoopInfo.
2805 if (ParentLoop) {
2806 ParentLoop->addChildLoop(Lp);
2807 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2808 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2809 } else {
2810 LI->addTopLevelLoop(Lp);
2811 }
2812 Lp->addBasicBlockToLoop(VecBody, *LI);
2813
2814 // Find the loop boundaries.
2815 Value *Count = getOrCreateTripCount(Lp);
2816
2817 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2818
2819 // Now, compare the new count to zero. If it is zero skip the vector loop and
2820 // jump to the scalar loop. This check also covers the case where the
2821 // backedge-taken count is uint##_max: adding one to it will overflow leading
2822 // to an incorrect trip count of zero. In this (rare) case we will also jump
2823 // to the scalar loop.
2824 emitMinimumIterationCountCheck(Lp, ScalarPH);
2825
2826 // Generate the code to check any assumptions that we've made for SCEV
2827 // expressions.
2828 emitSCEVChecks(Lp, ScalarPH);
2829
2830 // Generate the code that checks in runtime if arrays overlap. We put the
2831 // checks into a separate block to make the more common case of few elements
2832 // faster.
2833 emitMemRuntimeChecks(Lp, ScalarPH);
2834
2835 // Generate the induction variable.
2836 // The loop step is equal to the vectorization factor (num of SIMD elements)
2837 // times the unroll factor (num of SIMD instructions).
2838 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2839 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2840 Induction =
2841 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2842 getDebugLocFromInstOrOperands(OldInduction));
2843
2844 // We are going to resume the execution of the scalar loop.
2845 // Go over all of the induction variables that we found and fix the
2846 // PHIs that are left in the scalar version of the loop.
2847 // The starting values of PHI nodes depend on the counter of the last
2848 // iteration in the vectorized loop.
2849 // If we come from a bypass edge then we need to start from the original
2850 // start value.
2851
2852 // This variable saves the new starting index for the scalar loop. It is used
2853 // to test if there are any tail iterations left once the vector loop has
2854 // completed.
2855 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2856 for (auto &InductionEntry : *List) {
2857 PHINode *OrigPhi = InductionEntry.first;
2858 InductionDescriptor II = InductionEntry.second;
2859
2860 // Create phi nodes to merge from the backedge-taken check block.
2861 PHINode *BCResumeVal = PHINode::Create(
2862 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2863 // Copy original phi DL over to the new one.
2864 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2865 Value *&EndValue = IVEndValues[OrigPhi];
2866 if (OrigPhi == OldInduction) {
2867 // We know what the end value is.
2868 EndValue = CountRoundDown;
2869 } else {
2870 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2871 Type *StepType = II.getStep()->getType();
2872 Instruction::CastOps CastOp =
2873 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2874 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2875 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2876 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2877 EndValue->setName("ind.end");
2878 }
2879
2880 // The new PHI merges the original incoming value, in case of a bypass,
2881 // or the value at the end of the vectorized loop.
2882 BCResumeVal->addIncoming(EndValue, MiddleBlock);
2883
2884 // Fix the scalar body counter (PHI node).
2885 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2886
2887 // The old induction's phi node in the scalar body needs the truncated
2888 // value.
2889 for (BasicBlock *BB : LoopBypassBlocks)
2890 BCResumeVal->addIncoming(II.getStartValue(), BB);
2891 OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2892 }
2893
2894 // We need the OrigLoop (scalar loop part) latch terminator to help
2895 // produce correct debug info for the middle block BB instructions.
2896 // The legality check stage guarantees that the loop will have a single
2897 // latch.
2898 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2899        "Scalar loop latch terminator isn't a branch");
2900 BranchInst *ScalarLatchBr =
2901 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2902
2903 // Add a check in the middle block to see if we have completed
2904 // all of the iterations in the first vector loop.
2905 // If (N - N%VF) == N, then we *don't* need to run the remainder.
2906 // If tail is to be folded, we know we don't need to run the remainder.
2907 Value *CmpN = Builder.getTrue();
2908 if (!Cost->foldTailByMasking()) {
2909 CmpN =
2910 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2911 CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2912
2913 // Provide correct stepping behaviour by using the same DebugLoc as the
2914 // scalar loop latch branch cmp if it exists.
2915 if (CmpInst *ScalarLatchCmp =
2916 dyn_cast_or_null<CmpInst>(ScalarLatchBr->getCondition()))
2917 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchCmp->getDebugLoc());
2918 }
2919
2920 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
2921 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
2922 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
2923
2924 // Get ready to start creating new instructions into the vectorized body.
2925 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2926
2927 // Save the state.
2928 LoopVectorPreHeader = Lp->getLoopPreheader();
2929 LoopScalarPreHeader = ScalarPH;
2930 LoopMiddleBlock = MiddleBlock;
2931 LoopExitBlock = ExitBlock;
2932 LoopVectorBody = VecBody;
2933 LoopScalarBody = OldBasicBlock;
2934
2935 Optional<MDNode *> VectorizedLoopID =
2936 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2937 LLVMLoopVectorizeFollowupVectorized});
2938 if (VectorizedLoopID.hasValue()) {
2939 Lp->setLoopID(VectorizedLoopID.getValue());
2940
2941 // Do not setAlreadyVectorized if loop attributes have been defined
2942 // explicitly.
2943 return LoopVectorPreHeader;
2944 }
2945
2946 // Keep all loop hints from the original loop on the vector loop (we'll
2947 // replace the vectorizer-specific hints below).
2948 if (MDNode *LID = OrigLoop->getLoopID())
2949 Lp->setLoopID(LID);
2950
2951 LoopVectorizeHints Hints(Lp, true, *ORE);
2952 Hints.setAlreadyVectorized();
2953
2954 return LoopVectorPreHeader;
2955}
2956
2957// Fix up external users of the induction variable. At this point, we are
2958// in LCSSA form, with all external PHIs that use the IV having one input value,
2959// coming from the remainder loop. We need those PHIs to also have a correct
2960// value for the IV when arriving directly from the middle block.
2961void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2962 const InductionDescriptor &II,
2963 Value *CountRoundDown, Value *EndValue,
2964 BasicBlock *MiddleBlock) {
2965 // There are two kinds of external IV usages - those that use the value
2966 // computed in the last iteration (the PHI) and those that use the penultimate
2967 // value (the value that feeds into the phi from the loop latch).
2968 // We allow both, but they, obviously, have different values.
2969
2970 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2971
2972 DenseMap<Value *, Value *> MissingVals;
2973
2974 // An external user of the last iteration's value should see the value that
2975 // the remainder loop uses to initialize its own IV.
2976 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2977 for (User *U : PostInc->users()) {
2978 Instruction *UI = cast<Instruction>(U);
2979 if (!OrigLoop->contains(UI)) {
2980 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2981 MissingVals[UI] = EndValue;
2982 }
2983 }
2984
2985 // An external user of the penultimate value needs to see EndValue - Step.
2986 // The simplest way to get this is to recompute it from the constituent SCEVs,
2987 // that is Start + (Step * (CRD - 1)).
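// E.g. (illustrative values): with Start = 0, Step = 2 and CRD = 8, such a
// user sees 0 + 2 * (8 - 1) = 14 rather than the final value 16.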
2988 for (User *U : OrigPhi->users()) {
2989 auto *UI = cast<Instruction>(U);
2990 if (!OrigLoop->contains(UI)) {
2991 const DataLayout &DL =
2992 OrigLoop->getHeader()->getModule()->getDataLayout();
2993 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2994
2995 IRBuilder<> B(MiddleBlock->getTerminator());
2996 Value *CountMinusOne = B.CreateSub(
2997 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2998 Value *CMO =
2999 !II.getStep()->getType()->isIntegerTy()
3000 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3001 II.getStep()->getType())
3002 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3003 CMO->setName("cast.cmo");
3004 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3005 Escape->setName("ind.escape");
3006 MissingVals[UI] = Escape;
3007 }
3008 }
3009
3010 for (auto &I : MissingVals) {
3011 PHINode *PHI = cast<PHINode>(I.first);
3012 // One corner case we have to handle is two IVs "chasing" each-other,
3013 // that is %IV2 = phi [...], [ %IV1, %latch ]
3014 // In this case, if IV1 has an external use, we need to avoid adding both
3015 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3016 // don't already have an incoming value for the middle block.
3017 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3018 PHI->addIncoming(I.second, MiddleBlock);
3019 }
3020}
3021
3022namespace {
3023
3024struct CSEDenseMapInfo {
3025 static bool canHandle(const Instruction *I) {
3026 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3027 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3028 }
3029
3030 static inline Instruction *getEmptyKey() {
3031 return DenseMapInfo<Instruction *>::getEmptyKey();
3032 }
3033
3034 static inline Instruction *getTombstoneKey() {
3035 return DenseMapInfo<Instruction *>::getTombstoneKey();
3036 }
3037
3038 static unsigned getHashValue(const Instruction *I) {
3039 assert(canHandle(I) && "Unknown instruction!");
3040 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3041 I->value_op_end()));
3042 }
3043
3044 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3045 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3046 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3047 return LHS == RHS;
3048 return LHS->isIdenticalTo(RHS);
3049 }
3050};
3051
3052} // end anonymous namespace
3053
3054 /// Perform CSE of induction variable instructions.
3055static void cse(BasicBlock *BB) {
3056 // Perform simple cse.
3057 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3058 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3059 Instruction *In = &*I++;
3060
3061 if (!CSEDenseMapInfo::canHandle(In))
3062 continue;
3063
3064 // Check if we can replace this instruction with any of the
3065 // visited instructions.
3066 if (Instruction *V = CSEMap.lookup(In)) {
3067 In->replaceAllUsesWith(V);
3068 In->eraseFromParent();
3069 continue;
3070 }
3071
3072 CSEMap[In] = In;
3073 }
3074}
3075
3076unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3077 unsigned VF,
3078 bool &NeedToScalarize) {
3079 Function *F = CI->getCalledFunction();
3080 StringRef FnName = CI->getCalledFunction()->getName();
3081 Type *ScalarRetTy = CI->getType();
3082 SmallVector<Type *, 4> Tys, ScalarTys;
3083 for (auto &ArgOp : CI->arg_operands())
3084 ScalarTys.push_back(ArgOp->getType());
3085
3086 // Estimate cost of scalarized vector call. The source operands are assumed
3087 // to be vectors, so we need to extract individual elements from there,
3088 // execute VF scalar calls, and then gather the result into the vector return
3089 // value.
3090 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3091 if (VF == 1)
3092 return ScalarCallCost;
3093
3094 // Compute corresponding vector type for return value and arguments.
3095 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3096 for (Type *ScalarTy : ScalarTys)
3097 Tys.push_back(ToVectorTy(ScalarTy, VF));
3098
3099 // Compute costs of unpacking argument values for the scalar calls and
3100 // packing the return values to a vector.
3101 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3102
3103 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3104
3105 // If we can't emit a vector call for this function, then the currently found
3106 // cost is the cost we need to return.
3107 NeedToScalarize = true;
3108 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3109 return Cost;
3110
3111 // If the corresponding vector cost is cheaper, return its cost.
3112 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3113 if (VectorCallCost < Cost) {
3114 NeedToScalarize = false;
3115 return VectorCallCost;
3116 }
3117 return Cost;
3118}
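
The decision above boils down to simple arithmetic: scalarizing costs roughly ScalarCallCost * VF plus the extract/insert overhead, and the vector library call is chosen only when its cost is strictly smaller. The sketch below replays that comparison with invented numbers; it is only a model of the formula, not the cost model itself.

// Hedged numeric sketch of the vector-call decision. All costs are invented.
#include <cstdio>

int main() {
  unsigned VF = 4;
  unsigned ScalarCallCost = 10;
  unsigned ScalarizationCost = 6; // unpack arguments + pack results
  unsigned VectorCallCost = 24;   // cost of one call to a vector routine

  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost; // 46
  bool NeedToScalarize = VectorCallCost >= ScalarizedCost;
  std::printf("scalarized=%u vector=%u -> use %s version\n", ScalarizedCost,
              VectorCallCost, NeedToScalarize ? "scalarized" : "vector");
  return 0;
}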
3119
3120unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3121 unsigned VF) {
3122 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3123  assert(ID && "Expected intrinsic call!");
3124
3125 FastMathFlags FMF;
3126 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3127 FMF = FPMO->getFastMathFlags();
3128
3129 SmallVector<Value *, 4> Operands(CI->arg_operands());
3130 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3131}
3132
3133static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3134 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3135 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3136 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3137}
3138static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3139 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3140 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3141 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3142}
3143
3144void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3145 // For every instruction `I` in MinBWs, truncate the operands, create a
3146 // truncated version of `I` and reextend its result. InstCombine runs
3147 // later and will remove any ext/trunc pairs.
3148 SmallPtrSet<Value *, 4> Erased;
3149 for (const auto &KV : Cost->getMinimalBitwidths()) {
3150 // If the value wasn't vectorized, we must maintain the original scalar
3151 // type. The absence of the value from VectorLoopValueMap indicates that it
3152 // wasn't vectorized.
3153 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3154 continue;
3155 for (unsigned Part = 0; Part < UF; ++Part) {
3156 Value *I = getOrCreateVectorValue(KV.first, Part);
3157 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3158 !isa<Instruction>(I))
3159 continue;
3160 Type *OriginalTy = I->getType();
3161 Type *ScalarTruncatedTy =
3162 IntegerType::get(OriginalTy->getContext(), KV.second);
3163 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3164 OriginalTy->getVectorNumElements());
3165 if (TruncatedTy == OriginalTy)
3166 continue;
3167
3168 IRBuilder<> B(cast<Instruction>(I));
3169 auto ShrinkOperand = [&](Value *V) -> Value * {
3170 if (auto *ZI = dyn_cast<ZExtInst>(V))
3171 if (ZI->getSrcTy() == TruncatedTy)
3172 return ZI->getOperand(0);
3173 return B.CreateZExtOrTrunc(V, TruncatedTy);
3174 };
3175
3176 // The actual instruction modification depends on the instruction type,
3177 // unfortunately.
3178 Value *NewI = nullptr;
3179 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3180 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3181 ShrinkOperand(BO->getOperand(1)));
3182
3183 // Any wrapping introduced by shrinking this operation shouldn't be
3184 // considered undefined behavior. So, we can't unconditionally copy
3185 // arithmetic wrapping flags to NewI.
3186 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3187 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3188 NewI =
3189 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3190 ShrinkOperand(CI->getOperand(1)));
3191 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3192 NewI = B.CreateSelect(SI->getCondition(),
3193 ShrinkOperand(SI->getTrueValue()),
3194 ShrinkOperand(SI->getFalseValue()));
3195 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3196 switch (CI->getOpcode()) {
3197 default:
3198        llvm_unreachable("Unhandled cast!");
3199 case Instruction::Trunc:
3200 NewI = ShrinkOperand(CI->getOperand(0));
3201 break;
3202 case Instruction::SExt:
3203 NewI = B.CreateSExtOrTrunc(
3204 CI->getOperand(0),
3205 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3206 break;
3207 case Instruction::ZExt:
3208 NewI = B.CreateZExtOrTrunc(
3209 CI->getOperand(0),
3210 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3211 break;
3212 }
3213 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3214 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3215 auto *O0 = B.CreateZExtOrTrunc(
3216 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3217 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3218 auto *O1 = B.CreateZExtOrTrunc(
3219 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3220
3221 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3222 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3223 // Don't do anything with the operands, just extend the result.
3224 continue;
3225 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3226 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3227 auto *O0 = B.CreateZExtOrTrunc(
3228 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3229 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3230 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3231 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3232 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3233 auto *O0 = B.CreateZExtOrTrunc(
3234 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3235 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3236 } else {
3237 // If we don't know what to do, be conservative and don't do anything.
3238 continue;
3239 }
3240
3241 // Lastly, extend the result.
3242 NewI->takeName(cast<Instruction>(I));
3243 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3244 I->replaceAllUsesWith(Res);
3245 cast<Instruction>(I)->eraseFromParent();
3246 Erased.insert(I);
3247 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3248 }
3249 }
3250
3251 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3252 for (const auto &KV : Cost->getMinimalBitwidths()) {
3253 // If the value wasn't vectorized, we must maintain the original scalar
3254 // type. The absence of the value from VectorLoopValueMap indicates that it
3255 // wasn't vectorized.
3256 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3257 continue;
3258 for (unsigned Part = 0; Part < UF; ++Part) {
3259 Value *I = getOrCreateVectorValue(KV.first, Part);
3260 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3261 if (Inst && Inst->use_empty()) {
3262 Value *NewI = Inst->getOperand(0);
3263 Inst->eraseFromParent();
3264 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3265 }
3266 }
3267 }
3268}
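
The transformation above relies on the fact that, when only the low KV.second bits of a result are demanded, the operation can be performed in the narrower type and the result zero-extended back. A minimal stand-alone check of that property for an 8-bit add (illustrative only, not LLVM code):

// Shrink-then-extend: the low 8 bits of a 32-bit add equal the result of an
// 8-bit add that is zero-extended afterwards.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0x123400FFu, B = 0x00000001u;
  uint32_t Wide = (A + B) & 0xFFu;                                   // original width
  uint32_t Narrow = (uint32_t)(uint8_t)((uint8_t)A + (uint8_t)B);    // 8-bit op, zext
  assert(Wide == Narrow);
  return 0;
}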
3269
3270void InnerLoopVectorizer::fixVectorizedLoop() {
3271 // Insert truncates and extends for any truncated instructions as hints to
3272 // InstCombine.
3273 if (VF > 1)
3274 truncateToMinimalBitwidths();
3275
3276 // Fix widened non-induction PHIs by setting up the PHI operands.
3277 if (OrigPHIsToFix.size()) {
3278    assert(EnableVPlanNativePath &&
3279           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3280 fixNonInductionPHIs();
3281 }
3282
3283 // At this point every instruction in the original loop is widened to a
3284 // vector form. Now we need to fix the recurrences in the loop. These PHI
3285 // nodes are currently empty because we did not want to introduce cycles.
3286 // This is the second stage of vectorizing recurrences.
3287 fixCrossIterationPHIs();
3288
3289 // Update the dominator tree.
3290 //
3291 // FIXME: After creating the structure of the new loop, the dominator tree is
3292 // no longer up-to-date, and it remains that way until we update it
3293 // here. An out-of-date dominator tree is problematic for SCEV,
3294 // because SCEVExpander uses it to guide code generation. The
3295  //        vectorizer uses SCEVExpanders in several places. Instead, we should
3296 // keep the dominator tree up-to-date as we go.
3297 updateAnalysis();
3298
3299 // Fix-up external users of the induction variables.
3300 for (auto &Entry : *Legal->getInductionVars())
3301 fixupIVUsers(Entry.first, Entry.second,
3302 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3303 IVEndValues[Entry.first], LoopMiddleBlock);
3304
3305 fixLCSSAPHIs();
3306 for (Instruction *PI : PredicatedInstructions)
3307 sinkScalarOperands(&*PI);
3308
3309 // Remove redundant induction instructions.
3310 cse(LoopVectorBody);
3311}
3312
3313void InnerLoopVectorizer::fixCrossIterationPHIs() {
3314 // In order to support recurrences we need to be able to vectorize Phi nodes.
3315 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3316 // stage #2: We now need to fix the recurrences by adding incoming edges to
3317 // the currently empty PHI nodes. At this point every instruction in the
3318 // original loop is widened to a vector form so we can use them to construct
3319 // the incoming edges.
3320 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3321 // Handle first-order recurrences and reductions that need to be fixed.
3322 if (Legal->isFirstOrderRecurrence(&Phi))
3323 fixFirstOrderRecurrence(&Phi);
3324 else if (Legal->isReductionVariable(&Phi))
3325 fixReduction(&Phi);
3326 }
3327}
3328
3329void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3330 // This is the second phase of vectorizing first-order recurrences. An
3331 // overview of the transformation is described below. Suppose we have the
3332 // following loop.
3333 //
3334 // for (int i = 0; i < n; ++i)
3335 // b[i] = a[i] - a[i - 1];
3336 //
3337 // There is a first-order recurrence on "a". For this loop, the shorthand
3338 // scalar IR looks like:
3339 //
3340 // scalar.ph:
3341 // s_init = a[-1]
3342 // br scalar.body
3343 //
3344 // scalar.body:
3345 // i = phi [0, scalar.ph], [i+1, scalar.body]
3346 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3347 // s2 = a[i]
3348 // b[i] = s2 - s1
3349 // br cond, scalar.body, ...
3350 //
3351  // In this example, s1 is a recurrence because its value depends on the
3352 // previous iteration. In the first phase of vectorization, we created a
3353 // temporary value for s1. We now complete the vectorization and produce the
3354 // shorthand vector IR shown below (for VF = 4, UF = 1).
3355 //
3356 // vector.ph:
3357 // v_init = vector(..., ..., ..., a[-1])
3358 // br vector.body
3359 //
3360 // vector.body
3361 // i = phi [0, vector.ph], [i+4, vector.body]
3362 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3363 // v2 = a[i, i+1, i+2, i+3];
3364 // v3 = vector(v1(3), v2(0, 1, 2))
3365 // b[i, i+1, i+2, i+3] = v2 - v3
3366 // br cond, vector.body, middle.block
3367 //
3368 // middle.block:
3369 // x = v2(3)
3370 // br scalar.ph
3371 //
3372 // scalar.ph:
3373 // s_init = phi [x, middle.block], [a[-1], otherwise]
3374 // br scalar.body
3375 //
3376  // After the vector loop completes execution, we extract the next value of
3377 // the recurrence (x) to use as the initial value in the scalar loop.
3378
3379 // Get the original loop preheader and single loop latch.
3380 auto *Preheader = OrigLoop->getLoopPreheader();
3381 auto *Latch = OrigLoop->getLoopLatch();
3382
3383 // Get the initial and previous values of the scalar recurrence.
3384 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3385 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3386
3387 // Create a vector from the initial value.
3388 auto *VectorInit = ScalarInit;
3389 if (VF > 1) {
3390 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3391 VectorInit = Builder.CreateInsertElement(
3392 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3393 Builder.getInt32(VF - 1), "vector.recur.init");
3394 }
3395
3396 // We constructed a temporary phi node in the first phase of vectorization.
3397 // This phi node will eventually be deleted.
3398 Builder.SetInsertPoint(
3399 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3400
3401 // Create a phi node for the new recurrence. The current value will either be
3402 // the initial value inserted into a vector or loop-varying vector value.
3403 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3404 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3405
3406 // Get the vectorized previous value of the last part UF - 1. It appears last
3407 // among all unrolled iterations, due to the order of their construction.
3408 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3409
3410 // Set the insertion point after the previous value if it is an instruction.
3411 // Note that the previous value may have been constant-folded so it is not
3412 // guaranteed to be an instruction in the vector loop. Also, if the previous
3413 // value is a phi node, we should insert after all the phi nodes to avoid
3414 // breaking basic block verification.
3415 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3416 isa<PHINode>(PreviousLastPart))
3417 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3418 else
3419 Builder.SetInsertPoint(
3420 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3421
3422 // We will construct a vector for the recurrence by combining the values for
3423 // the current and previous iterations. This is the required shuffle mask.
3424 SmallVector<Constant *, 8> ShuffleMask(VF);
3425 ShuffleMask[0] = Builder.getInt32(VF - 1);
3426 for (unsigned I = 1; I < VF; ++I)
3427 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3428
3429 // The vector from which to take the initial value for the current iteration
3430 // (actual or unrolled). Initially, this is the vector phi node.
3431 Value *Incoming = VecPhi;
3432
3433 // Shuffle the current and previous vector and update the vector parts.
3434 for (unsigned Part = 0; Part < UF; ++Part) {
3435 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3436 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3437 auto *Shuffle =
3438 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3439 ConstantVector::get(ShuffleMask))
3440 : Incoming;
3441 PhiPart->replaceAllUsesWith(Shuffle);
3442 cast<Instruction>(PhiPart)->eraseFromParent();
3443 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3444 Incoming = PreviousPart;
3445 }
3446
3447 // Fix the latch value of the new recurrence in the vector loop.
3448 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3449
3450 // Extract the last vector element in the middle block. This will be the
3451 // initial value for the recurrence when jumping to the scalar loop.
3452 auto *ExtractForScalar = Incoming;
3453 if (VF > 1) {
3454 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3455 ExtractForScalar = Builder.CreateExtractElement(
3456 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3457 }
3458 // Extract the second last element in the middle block if the
3459 // Phi is used outside the loop. We need to extract the phi itself
3460 // and not the last element (the phi update in the current iteration). This
3461 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3462 // when the scalar loop is not run at all.
3463 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3464 if (VF > 1)
3465 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3466 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3467  // When the loop is unrolled without vectorizing, initialize
3468  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3469  // `Incoming`. This is analogous to the vectorized case above: extracting the
3470 // second last element when VF > 1.
3471 else if (UF > 1)
3472 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3473
3474 // Fix the initial value of the original recurrence in the scalar loop.
3475 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3476 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3477 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3478 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3479 Start->addIncoming(Incoming, BB);
3480 }
3481
3482 Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
3483 Phi->setName("scalar.recur");
3484
3485 // Finally, fix users of the recurrence outside the loop. The users will need
3486 // either the last value of the scalar recurrence or the last value of the
3487 // vector recurrence we extracted in the middle block. Since the loop is in
3488 // LCSSA form, we just need to find all the phi nodes for the original scalar
3489 // recurrence in the exit block, and then add an edge for the middle block.
3490 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3491 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3492 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3493 }
3494 }
3495}
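
The shuffle mask built above is [VF-1, VF, ..., 2*VF-2]: lane 0 takes the last element of the previous vector and the remaining lanes take the first VF-1 elements of the current one. A small stand-alone sketch of that indexing (plain arrays instead of IR values, VF = 4 assumed):

// Model the recurrence shuffle: concatenate v1 (previous) and v2 (current)
// and select elements VF-1 .. 2*VF-2.
#include <cstdio>

int main() {
  const unsigned VF = 4;
  int V1[VF] = {10, 11, 12, 13}; // previous iteration values
  int V2[VF] = {20, 21, 22, 23}; // current iteration values
  int Concat[2 * VF];
  for (unsigned I = 0; I < VF; ++I) {
    Concat[I] = V1[I];
    Concat[VF + I] = V2[I];
  }
  int Shuffled[VF];
  for (unsigned I = 0; I < VF; ++I)
    Shuffled[I] = Concat[VF - 1 + I]; // mask element I is I + VF - 1
  // Shuffled is {13, 20, 21, 22}: each lane sees the value from one step back.
  for (unsigned I = 0; I < VF; ++I)
    std::printf("%d ", Shuffled[I]);
  std::printf("\n");
  return 0;
}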
3496
3497void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3498 Constant *Zero = Builder.getInt32(0);
3499
3500  // Get its reduction variable descriptor.
3501  assert(Legal->isReductionVariable(Phi) &&
3502         "Unable to find the reduction variable");
3503 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3504
3505 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3506 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3507 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3508 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3509 RdxDesc.getMinMaxRecurrenceKind();
3510 setDebugLocFromInst(Builder, ReductionStartValue);
3511
3512 // We need to generate a reduction vector from the incoming scalar.
3513 // To do so, we need to generate the 'identity' vector and override
3514 // one of the elements with the incoming scalar reduction. We need
3515 // to do it in the vector-loop preheader.
3516 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3517
3518 // This is the vector-clone of the value that leaves the loop.
3519 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3520
3521  // Find the reduction identity value: zero for addition, or and xor,
3522  // one for multiplication, -1 for and.
3523 Value *Identity;
3524 Value *VectorStart;
3525 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3526 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3527    // MinMax reductions have the start value as their identity.
3528 if (VF == 1) {
3529 VectorStart = Identity = ReductionStartValue;
3530 } else {
3531 VectorStart = Identity =
3532 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3533 }
3534 } else {
3535 // Handle other reduction kinds:
3536 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3537 RK, VecTy->getScalarType());
3538 if (VF == 1) {
3539 Identity = Iden;
3540 // This vector is the Identity vector where the first element is the
3541 // incoming scalar reduction.
3542 VectorStart = ReductionStartValue;
3543 } else {
3544 Identity = ConstantVector::getSplat(VF, Iden);
3545
3546 // This vector is the Identity vector where the first element is the
3547 // incoming scalar reduction.
3548 VectorStart =
3549 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3550 }
3551 }
3552
3553 // Fix the vector-loop phi.
3554
3555 // Reductions do not have to start at zero. They can start with
3556 // any loop invariant values.
3557 BasicBlock *Latch = OrigLoop->getLoopLatch();
3558 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3559 for (unsigned Part = 0; Part < UF; ++Part) {
3560 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3561 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3562    // Make sure to add the reduction start value only to the
3563 // first unroll part.
3564 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3565 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3566 cast<PHINode>(VecRdxPhi)
3567 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3568 }
3569
3570 // Before each round, move the insertion point right between
3571 // the PHIs and the values we are going to write.
3572 // This allows us to write both PHINodes and the extractelement
3573 // instructions.
3574 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3575
3576 setDebugLocFromInst(Builder, LoopExitInst);
3577
3578 // If the vector reduction can be performed in a smaller type, we truncate
3579 // then extend the loop exit value to enable InstCombine to evaluate the
3580 // entire expression in the smaller type.
3581 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3582 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3583 Builder.SetInsertPoint(
3584 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3585 VectorParts RdxParts(UF);
3586 for (unsigned Part = 0; Part < UF; ++Part) {
3587 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3588 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3589 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3590 : Builder.CreateZExt(Trunc, VecTy);
3591 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3592 UI != RdxParts[Part]->user_end();)
3593 if (*UI != Trunc) {
3594 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3595 RdxParts[Part] = Extnd;
3596 } else {
3597 ++UI;
3598 }
3599 }
3600 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3601 for (unsigned Part = 0; Part < UF; ++Part) {
3602 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3603 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3604 }
3605 }
3606
3607 // Reduce all of the unrolled parts into a single vector.
3608 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3609 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3610 setDebugLocFromInst(Builder, ReducedPartRdx);
3611 for (unsigned Part = 1; Part < UF; ++Part) {
3612 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3613 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3614 // Floating point operations had to be 'fast' to enable the reduction.
3615 ReducedPartRdx = addFastMathFlag(
3616 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3617 ReducedPartRdx, "bin.rdx"),
3618 RdxDesc.getFastMathFlags());
3619 else
3620 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3621 RdxPart);
3622 }
3623
3624 if (VF > 1) {
3625 bool NoNaN = Legal->hasFunNoNaNAttr();
3626 ReducedPartRdx =
3627 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3628 // If the reduction can be performed in a smaller type, we need to extend
3629 // the reduction to the wider type before we branch to the original loop.
3630 if (Phi->getType() != RdxDesc.getRecurrenceType())
3631 ReducedPartRdx =
3632 RdxDesc.isSigned()
3633 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3634 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3635 }
3636
3637 // Create a phi node that merges control-flow from the backedge-taken check
3638 // block and the middle block.
3639 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3640 LoopScalarPreHeader->getTerminator());
3641 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3642 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3643 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3644
3645 // Now, we need to fix the users of the reduction variable
3646 // inside and outside of the scalar remainder loop.
3647 // We know that the loop is in LCSSA form. We need to update the
3648 // PHI nodes in the exit blocks.
3649 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3650 // All PHINodes need to have a single entry edge, or two if
3651 // we already fixed them.
3652    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3653
3654 // We found a reduction value exit-PHI. Update it with the
3655 // incoming bypass edge.
3656 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3657 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3658 } // end of the LCSSA phi scan.
3659
3660 // Fix the scalar loop reduction variable with the incoming reduction sum
3661 // from the vector body and from the backedge value.
3662 int IncomingEdgeBlockIdx =
3663 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3664  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3665 // Pick the other block.
3666 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3667 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3668 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3669}
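
One way to see the start-vector construction above is with a toy integer add reduction: only lane 0 of the first unroll part carries the scalar start value, all other lanes start at the identity, and the final value is obtained by folding all parts and lanes together. The sketch below models that with plain arrays and invented iteration counts; it is not the pass itself.

// Toy add reduction with VF = 4, UF = 2: identity is 0, lane (0,0) holds the
// loop-invariant start value, and the middle block folds everything together.
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  int Start = 7; // loop-invariant start of the reduction
  int Phi[UF][VF];
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Phi[Part][Lane] = 0; // identity for add
  Phi[0][0] = Start;       // VectorStart: identity with lane 0 = start value

  // Pretend each lane accumulated one contribution per vector iteration.
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Phi[Part][Lane] += 1;

  int Rdx = 0; // reduce all parts, then all lanes
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Rdx += Phi[Part][Lane];
  std::printf("reduced value = %d (start 7 + 8 contributions)\n", Rdx);
  return 0;
}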
3670
3671void InnerLoopVectorizer::fixLCSSAPHIs() {
3672 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3673 if (LCSSAPhi.getNumIncomingValues() == 1) {
3674 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3675 // Non-instruction incoming values will have only one value.
3676 unsigned LastLane = 0;
3677 if (isa<Instruction>(IncomingValue))
3678 LastLane = Cost->isUniformAfterVectorization(
3679 cast<Instruction>(IncomingValue), VF)
3680 ? 0
3681 : VF - 1;
3682 // Can be a loop invariant incoming value or the last scalar value to be
3683 // extracted from the vectorized loop.
3684 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3685 Value *lastIncomingValue =
3686 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3687 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3688 }
3689 }
3690}
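
The lane selection above can be summarized as: a value that is uniform after vectorization is read from lane 0, otherwise from the last lane, always of the last unroll part. A tiny illustrative helper (assumed VF and UF values):

// Which scalar lane holds the value an LCSSA phi should receive.
#include <cstdio>

unsigned lastLane(bool UniformAfterVectorization, unsigned VF) {
  return UniformAfterVectorization ? 0 : VF - 1;
}

int main() {
  unsigned VF = 4, UF = 2;
  std::printf("uniform:     part %u, lane %u\n", UF - 1, lastLane(true, VF));
  std::printf("non-uniform: part %u, lane %u\n", UF - 1, lastLane(false, VF));
  return 0;
}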
3691
3692void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3693 // The basic block and loop containing the predicated instruction.
3694 auto *PredBB = PredInst->getParent();
3695 auto *VectorLoop = LI->getLoopFor(PredBB);
3696
3697 // Initialize a worklist with the operands of the predicated instruction.
3698 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3699
3700 // Holds instructions that we need to analyze again. An instruction may be
3701 // reanalyzed if we don't yet know if we can sink it or not.
3702 SmallVector<Instruction *, 8> InstsToReanalyze;
3703
3704 // Returns true if a given use occurs in the predicated block. Phi nodes use
3705 // their operands in their corresponding predecessor blocks.
3706 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3707 auto *I = cast<Instruction>(U.getUser());
3708 BasicBlock *BB = I->getParent();
3709 if (auto *Phi = dyn_cast<PHINode>(I))
3710 BB = Phi->getIncomingBlock(
3711 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3712 return BB == PredBB;
3713 };
3714
3715 // Iteratively sink the scalarized operands of the predicated instruction
3716  // into the block we created for it. When an instruction is sunk, its
3717  // operands are then added to the worklist. The algorithm ends when one pass
3718  // through the worklist fails to sink a single instruction.
3719 bool Changed;
3720 do {
3721 // Add the instructions that need to be reanalyzed to the worklist, and
3722 // reset the changed indicator.
3723 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3724 InstsToReanalyze.clear();
3725 Changed = false;
3726
3727 while (!Worklist.empty()) {
3728 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3729
3730 // We can't sink an instruction if it is a phi node, is already in the
3731 // predicated block, is not in the loop, or may have side effects.
3732 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3733 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3734 continue;
3735
3736 // It's legal to sink the instruction if all its uses occur in the
3737 // predicated block. Otherwise, there's nothing to do yet, and we may
3738 // need to reanalyze the instruction.
3739 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3740 InstsToReanalyze.push_back(I);
3741 continue;
3742 }
3743
3744      // Move the instruction to the beginning of the predicated block, and add
3745      // its operands to the worklist.
3746 I->moveBefore(&*PredBB->getFirstInsertionPt());
3747 Worklist.insert(I->op_begin(), I->op_end());
3748
3749 // The sinking may have enabled other instructions to be sunk, so we will
3750 // need to iterate.
3751 Changed = true;
3752 }
3753 } while (Changed);
3754}
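
The sinking loop above follows a common fixed-point worklist shape: process items, defer the ones that cannot be decided yet, and stop once a full pass makes no change. The generic sketch below shows only that control structure; tryToProcess is a hypothetical stand-in for the "all uses are in the predicated block" check.

// Fixed-point worklist skeleton: handle what we can, requeue the rest, and
// iterate until a whole pass makes no progress.
#include <vector>

bool tryToProcess(int Item, bool &Deferred) {
  Deferred = (Item % 2 != 0); // stand-in for "cannot decide this item yet"
  return !Deferred;
}

int main() {
  std::vector<int> Worklist = {4, 3, 2, 1};
  std::vector<int> Reanalyze;
  bool Changed;
  do {
    Worklist.insert(Worklist.end(), Reanalyze.begin(), Reanalyze.end());
    Reanalyze.clear();
    Changed = false;
    while (!Worklist.empty()) {
      int Item = Worklist.back();
      Worklist.pop_back();
      bool Deferred = false;
      if (tryToProcess(Item, Deferred))
        Changed = true;            // made progress, so iterate again
      else if (Deferred)
        Reanalyze.push_back(Item); // look at it again in the next round
    }
  } while (Changed);
  return 0;
}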
3755
3756void InnerLoopVectorizer::fixNonInductionPHIs() {
3757 for (PHINode *OrigPhi : OrigPHIsToFix) {
3758 PHINode *NewPhi =
3759 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3760 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3761
3762 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3763 predecessors(OrigPhi->getParent()));
3764 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3765 predecessors(NewPhi->getParent()));
3766    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3767           "Scalar and Vector BB should have the same number of predecessors");
3768
3769 // The insertion point in Builder may be invalidated by the time we get
3770 // here. Force the Builder insertion point to something valid so that we do
3771 // not run into issues during insertion point restore in
3772 // getOrCreateVectorValue calls below.
3773 Builder.SetInsertPoint(NewPhi);
3774
3775 // The predecessor order is preserved and we can rely on mapping between
3776 // scalar and vector block predecessors.
3777 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3778 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3779
3780 // When looking up the new scalar/vector values to fix up, use incoming
3781 // values from original phi.
3782 Value *ScIncV =
3783 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3784
3785 // Scalar incoming value may need a broadcast
3786 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3787 NewPhi->addIncoming(NewIncV, NewPredBB);
3788 }
3789 }
3790}
3791
3792void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3793 unsigned VF) {
3794 PHINode *P = cast<PHINode>(PN);
3795 if (EnableVPlanNativePath) {
3796 // Currently we enter here in the VPlan-native path for non-induction
3797 // PHIs where all control flow is uniform. We simply widen these PHIs.
3798 // Create a vector phi with no operands - the vector phi operands will be
3799 // set at the end of vector code generation.
3800 Type *VecTy =
3801 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3802 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3803 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3804 OrigPHIsToFix.push_back(P);
3805
3806 return;
3807 }
3808
3809  assert(PN->getParent() == OrigLoop->getHeader() &&
3810         "Non-header phis should have been handled elsewhere");
3811
3812 // In order to support recurrences we need to be able to vectorize Phi nodes.
3813 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3814 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3815 // this value when we vectorize all of the instructions that use the PHI.
3816 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3817 for (unsigned Part = 0; Part < UF; ++Part) {
3818 // This is phase one of vectorizing PHIs.
3819 Type *VecTy =
3820 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3821 Value *EntryPart = PHINode::Create(
3822 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3823 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3824 }
3825 return;
3826 }
3827
3828 setDebugLocFromInst(Builder, P);
3829
3830 // This PHINode must be an induction variable.
3831 // Make sure that we know about it.
3832  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3833
3834 InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3835 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3836
3837 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3838 // which can be found from the original scalar operations.
3839 switch (II.getKind()) {
3840 case InductionDescriptor::IK_NoInduction:
3841    llvm_unreachable("Unknown induction");
3842 case InductionDescriptor::IK_IntInduction:
3843 case InductionDescriptor::IK_FpInduction:
3844    llvm_unreachable("Integer/fp induction is handled elsewhere.");
3845 case InductionDescriptor::IK_PtrInduction: {
3846 // Handle the pointer induction variable case.
3847    assert(P->getType()->isPointerTy() && "Unexpected type.");
3848 // This is the normalized GEP that starts counting at zero.
3849 Value *PtrInd = Induction;
3850 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3851 // Determine the number of scalars we need to generate for each unroll
3852 // iteration. If the instruction is uniform, we only need to generate the
3853 // first lane. Otherwise, we generate all VF values.
3854 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3855 // These are the scalar results. Notice that we don't generate vector GEPs
3856 // because scalar GEPs result in better code.
3857 for (unsigned Part = 0; Part < UF; ++Part) {
3858 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3859 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3860 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3861 Value *SclrGep =
3862 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3863 SclrGep->setName("next.gep");
3864 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3865 }
3866 }
3867 return;
3868 }
3869 }
3870}
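
For the pointer-induction case above, each scalar index is the normalized induction value plus Lane + Part * VF, which is then fed to emitTransformedIndex. The sketch below just prints those per-part, per-lane offsets for assumed VF = 4, UF = 2 and an invented induction value:

// Indices generated for a pointer induction: PtrInd + Lane + Part * VF.
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  long long PtrInd = 100; // normalized induction value entering this iteration
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("part %u lane %u -> index %lld\n", Part, Lane,
                  PtrInd + (long long)(Lane + Part * VF));
  return 0;
}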
3871
3872/// A helper function for checking whether an integer division-related
3873/// instruction may divide by zero (in which case it must be predicated if
3874/// executed conditionally in the scalar code).
3875/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3876/// Non-zero divisors that are non compile-time constants will not be
3877/// converted into multiplication, so we will still end up scalarizing
3878/// the division, but can do so w/o predication.
3879static bool mayDivideByZero(Instruction &I) {
3880  assert((I.getOpcode() == Instruction::UDiv ||
3881          I.getOpcode() == Instruction::SDiv ||
3882          I.getOpcode() == Instruction::URem ||
3883          I.getOpcode() == Instruction::SRem) &&
3884         "Unexpected instruction");
3885 Value *Divisor = I.getOperand(1);
3886 auto *CInt = dyn_cast<ConstantInt>(Divisor);
3887 return !CInt || CInt->isZero();
3888}
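
Restated outside of LLVM, the predicate above returns true unless the divisor is a compile-time constant known to be non-zero. A minimal stand-alone model (the Divisor struct is invented for the example):

// A division may trap unless its divisor is a non-zero compile-time constant.
#include <cassert>

struct Divisor {
  bool IsConstant;
  long long Value; // only meaningful when IsConstant is true
};

bool mayDivideByZero(const Divisor &D) {
  return !D.IsConstant || D.Value == 0;
}

int main() {
  assert(mayDivideByZero({false, 0})); // runtime divisor: must predicate
  assert(mayDivideByZero({true, 0}));  // constant zero: certainly traps
  assert(!mayDivideByZero({true, 8})); // non-zero constant: safe to scalarize
  return 0;
}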
3889
3890void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3891 switch (I.getOpcode()) {
3892 case Instruction::Br:
3893 case Instruction::PHI:
3894    llvm_unreachable("This instruction is handled by a different recipe.");
3895 case Instruction::GetElementPtr: {
3896 // Construct a vector GEP by widening the operands of the scalar GEP as
3897 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3898 // results in a vector of pointers when at least one operand of the GEP
3899 // is vector-typed. Thus, to keep the representation compact, we only use
3900 // vector-typed operands for loop-varying values.
3901 auto *GEP = cast<GetElementPtrInst>(&I);
3902
3903 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3904 // If we are vectorizing, but the GEP has only loop-invariant operands,
3905 // the GEP we build (by only using vector-typed operands for
3906 // loop-varying values) would be a scalar pointer. Thus, to ensure we
3907 // produce a vector of pointers, we need to either arbitrarily pick an
3908 // operand to broadcast, or broadcast a clone of the original GEP.
3909 // Here, we broadcast a clone of the original.
3910 //
3911 // TODO: If at some point we decide to scalarize instructions having
3912 // loop-invariant operands, this special case will no longer be
3913 // required. We would add the scalarization decision to
3914 // collectLoopScalars() and teach getVectorValue() to broadcast
3915 // the lane-zero scalar value.
3916 auto *Clone = Builder.Insert(GEP->clone());
3917 for (unsigned Part = 0; Part < UF; ++Part) {
3918 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3919 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3920 addMetadata(EntryPart, GEP);
3921 }
3922 } else {
3923 // If the GEP has at least one loop-varying operand, we are sure to
3924 // produce a vector of pointers. But if we are only unrolling, we want
3925 // to produce a scalar GEP for each unroll part. Thus, the GEP we
3926 // produce with the code below will be scalar (if VF == 1) or vector
3927 // (otherwise). Note that for the unroll-only case, we still maintain
3928 // values in the vector mapping with initVector, as we do for other
3929 // instructions.
3930 for (unsigned Part = 0; Part < UF; ++Part) {
3931 // The pointer operand of the new GEP. If it's loop-invariant, we
3932 // won't broadcast it.
3933 auto *Ptr =
3934 OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3935 ? GEP->getPointerOperand()
3936 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3937
3938 // Collect all the indices for the new GEP. If any index is
3939 // loop-invariant, we won't broadcast it.
3940 SmallVector<Value *, 4> Indices;
3941 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3942 if (OrigLoop->isLoopInvariant(U.get()))
3943 Indices.push_back(U.get());
3944 else
3945 Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3946 }
3947
3948 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3949 // but it should be a vector, otherwise.
3950 auto *NewGEP =
3951 GEP->isInBounds()
3952 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3953 Indices)
3954 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3955      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3956             "NewGEP is not a pointer vector");
3957 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3958 addMetadata(NewGEP, GEP);
3959 }
3960 }
3961
3962 break;
3963 }
3964 case Instruction::UDiv:
3965 case Instruction::SDiv:
3966 case Instruction::SRem:
3967 case Instruction::URem:
3968 case Instruction::Add:
3969 case Instruction::FAdd:
3970 case Instruction::Sub:
3971 case Instruction::FSub:
3972 case Instruction::Mul:
3973 case Instruction::FMul:
3974 case Instruction::FDiv:
3975 case Instruction::FRem:
3976 case Instruction::Shl:
3977 case Instruction::LShr:
3978 case Instruction::AShr:
3979 case Instruction::And:
3980 case Instruction::Or:
3981 case Instruction::Xor: {
3982 // Just widen binops.
3983 auto *BinOp = cast<BinaryOperator>(&I);
3984 setDebugLocFromInst(Builder, BinOp);
3985
3986 for (unsigned Part = 0; Part < UF; ++Part) {
3987 Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3988 Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3989 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3990
3991 if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3992 VecOp->copyIRFlags(BinOp);
3993
3994 // Use this vector value for all users of the original instruction.
3995 VectorLoopValueMap.setVectorValue(&I, Part, V);
3996 addMetadata(V, BinOp);
3997 }
3998
3999 break;
4000 }
4001 case Instruction::Select: {
4002 // Widen selects.
4003 // If the selector is loop invariant we can create a select
4004 // instruction with a scalar condition. Otherwise, use vector-select.
4005 auto *SE = PSE.getSE();
4006 bool InvariantCond =
4007 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4008 setDebugLocFromInst(Builder, &I);
4009
4010 // The condition can be loop invariant but still defined inside the
4011 // loop. This means that we can't just use the original 'cond' value.
4012 // We have to take the 'vectorized' value and pick the first lane.
4013 // Instcombine will make this a no-op.
4014
4015 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4016
4017 for (unsigned Part = 0; Part < UF; ++Part) {
4018 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4019 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4020 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4021 Value *Sel =
4022 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4023 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4024 addMetadata(Sel, &I);
4025 }
4026
4027 break;
4028 }
4029
4030 case Instruction::ICmp:
4031 case Instruction::FCmp: {
4032 // Widen compares. Generate vector compares.
4033 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4034 auto *Cmp = dyn_cast<CmpInst>(&I);
4035 setDebugLocFromInst(Builder, Cmp);
4036 for (unsigned Part = 0; Part < UF; ++Part) {
4037 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4038 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4039 Value *C = nullptr;
4040 if (FCmp) {
4041 // Propagate fast math flags.
4042 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4043 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4044 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4045 } else {
4046 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4047 }
4048 VectorLoopValueMap.setVectorValue(&I, Part, C);
4049 addMetadata(C, &I);
4050 }
4051
4052 break;
4053 }
4054
4055 case Instruction::ZExt:
4056 case Instruction::SExt:
4057 case Instruction::FPToUI:
4058 case Instruction::FPToSI:
4059 case Instruction::FPExt:
4060 case Instruction::PtrToInt:
4061 case Instruction::IntToPtr:
4062 case Instruction::SIToFP:
4063 case Instruction::UIToFP:
4064 case Instruction::Trunc:
4065 case Instruction::FPTrunc:
4066 case Instruction::BitCast: {
4067 auto *CI = dyn_cast<CastInst>(&I);
4068 setDebugLocFromInst(Builder, CI);
4069
4070 /// Vectorize casts.
4071 Type *DestTy =
4072 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4073
4074 for (unsigned Part = 0; Part < UF; ++Part) {
4075 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4076 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4077 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4078 addMetadata(Cast, &I);
4079 }
4080 break;
4081 }
4082
4083 case Instruction::Call: {
4084 // Ignore dbg intrinsics.
4085 if (isa<DbgInfoIntrinsic>(I))
4086 break;
4087 setDebugLocFromInst(Builder, &I);
4088
4089 Module *M = I.getParent()->getParent()->getParent();
4090 auto *CI = cast<CallInst>(&I);
4091
4092 StringRef FnName = CI->getCalledFunction()->getName();
4093 Function *F = CI->getCalledFunction();
4094 Type *RetTy = ToVectorTy(CI->getType(), VF);
4095 SmallVector<Type *, 4> Tys;
4096 for (Value *ArgOperand : CI->arg_operands())
4097 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4098
4099 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4100
4101    // The flag shows whether we use an intrinsic or an ordinary call for the
4102    // vectorized version of the instruction, i.e. whether it is beneficial to
4103    // perform an intrinsic call rather than a library call.
4104 bool NeedToScalarize;
4105 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4106 bool UseVectorIntrinsic =
4107 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4108    assert((UseVectorIntrinsic || !NeedToScalarize) &&
4109           "Instruction should be scalarized elsewhere.");
4110
4111 for (unsigned Part = 0; Part < UF; ++Part) {
4112 SmallVector<Value *, 4> Args;
4113 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4114 Value *Arg = CI->getArgOperand(i);
4115 // Some intrinsics have a scalar argument - don't replace it with a
4116 // vector.
4117 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4118 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4119 Args.push_back(Arg);
4120 }
4121
4122 Function *VectorF;
4123 if (UseVectorIntrinsic) {
4124 // Use vector version of the intrinsic.
4125 Type *TysForDecl[] = {CI->getType()};
4126 if (VF > 1)
4127 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4128 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4129 } else {
4130 // Use vector version of the library call.
4131 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4132        assert(!VFnName.empty() && "Vector function name is empty.");
4133 VectorF = M->getFunction(VFnName);
4134 if (!VectorF) {
4135 // Generate a declaration
4136 FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4137 VectorF =
4138 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4139 VectorF->copyAttributesFrom(F);
4140 }
4141 }
4142      assert(VectorF && "Can't create vector function.");
4143
4144 SmallVector<OperandBundleDef, 1> OpBundles;
4145 CI->getOperandBundlesAsDefs(OpBundles);
4146 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4147
4148 if (isa<FPMathOperator>(V))
4149 V->copyFastMathFlags(CI);
4150
4151 VectorLoopValueMap.setVectorValue(&I, Part, V);
4152 addMetadata(V, &I);
4153 }
4154
4155 break;
4156 }
4157
4158 default:
4159 // This instruction is not vectorized by simple widening.
4160    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4161    llvm_unreachable("Unhandled instruction!");
4162 } // end of switch.
4163}
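
Most of the cases above follow the same pattern: the scalar operation is re-emitted once per unroll part on vector operands. The toy model below shows that lane-wise view for an add with assumed VF = 4 and UF = 2; it is a conceptual sketch, not the widening code itself.

// "Widening" a binary operator: apply the scalar op lane-wise, once per part.
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  int A[UF][VF] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  int B[UF][VF] = {{10, 10, 10, 10}, {20, 20, 20, 20}};
  int V[UF][VF];
  for (unsigned Part = 0; Part < UF; ++Part)   // one vector op per unroll part
    for (unsigned Lane = 0; Lane < VF; ++Lane) // conceptually a single vector add
      V[Part][Lane] = A[Part][Lane] + B[Part][Lane];
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("%d ", V[Part][Lane]);
    std::printf("\n");
  }
  return 0;
}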
4164
4165void InnerLoopVectorizer::updateAnalysis() {
4166 // Forget the original basic block.
4167 PSE.getSE()->forgetLoop(OrigLoop);
4168
4169 // DT is not kept up-to-date for outer loop vectorization
4170 if (EnableVPlanNativePath)
4171 return;
4172
4173 // Update the dominator tree information.
4174  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4175         "Entry does not dominate exit.");
4176
4177 DT->addNewBlock(LoopMiddleBlock,
4178 LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4179 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4180 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4181 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4182  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4183}
4184
4185void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4186 // We should not collect Scalars more than once per VF. Right now, this
4187 // function is called from collectUniformsAndScalars(), which already does
4188 // this check. Collecting Scalars for VF=1 does not make any sense.
4189  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4190         "This function should not be visited twice for the same VF");
4191
4192 SmallSetVector<Instruction *, 8> Worklist;
4193
4194 // These sets are used to seed the analysis with pointers used by memory
4195 // accesses that will remain scalar.
4196 SmallSetVector<Instruction *, 8> ScalarPtrs;
4197 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4198
4199 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4200 // The pointer operands of loads and stores will be scalar as long as the
4201 // memory access is not a gather or scatter operation. The value operand of a
4202 // store will remain scalar if the store is scalarized.
4203 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4204 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4205    assert(WideningDecision != CM_Unknown &&
4206           "Widening decision should be ready at this moment");
4207 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4208 if (Ptr == Store->getValueOperand())
4209 return WideningDecision == CM_Scalarize;
4210    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4211           "Ptr is neither a value or pointer operand");
4212 return WideningDecision != CM_GatherScatter;
4213 };
4214
4215 // A helper that returns true if the given value is a bitcast or
4216 // getelementptr instruction contained in the loop.
4217 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4218 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4219 isa<GetElementPtrInst>(V)) &&
4220 !TheLoop->isLoopInvariant(V);
4221 };
4222
4223 // A helper that evaluates a memory access's use of a pointer. If the use
4224 // will be a scalar use, and the pointer is only used by memory accesses, we
4225 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4226 // PossibleNonScalarPtrs.
4227 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4228 // We only care about bitcast and getelementptr instructions contained in
4229 // the loop.
4230 if (!isLoopVaryingBitCastOrGEP(Ptr))
4231 return;
4232
4233 // If the pointer has already been identified as scalar (e.g., if it was
4234 // also identified as uniform), there's nothing to do.
4235 auto *I = cast<Instruction>(Ptr);
4236 if (Worklist.count(I))
4237 return;
4238
4239 // If the use of the pointer will be a scalar use, and all users of the
4240 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4241 // place the pointer in PossibleNonScalarPtrs.
4242 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4243 return isa<LoadInst>(U) || isa<StoreInst>(U);
4244 }))
4245 ScalarPtrs.insert(I);
4246 else
4247 PossibleNonScalarPtrs.insert(I);
4248 };
4249
4250 // We seed the scalars analysis with three classes of instructions: (1)
4251 // instructions marked uniform-after-vectorization, (2) bitcast and
4252 // getelementptr instructions used by memory accesses requiring a scalar use,
4253 // and (3) pointer induction variables and their update instructions (we
4254 // currently only scalarize these).
4255 //
4256 // (1) Add to the worklist all instructions that have been identified as
4257 // uniform-after-vectorization.
4258 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4259
4260 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4261 // memory accesses requiring a scalar use. The pointer operands of loads and
4262 // stores will be scalar as long as the memory access is not a gather or
4263 // scatter operation. The value operand of a store will remain scalar if the
4264 // store is scalarized.
4265 for (auto *BB : TheLoop->blocks())
4266 for (auto &I : *BB) {
4267 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4268 evaluatePtrUse(Load, Load->getPointerOperand());
4269 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4270 evaluatePtrUse(Store, Store->getPointerOperand());
4271 evaluatePtrUse(Store, Store->getValueOperand());
4272 }
4273 }
4274 for (auto *I : ScalarPtrs)
4275 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4276      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4277 Worklist.insert(I);
4278 }
4279
4280 // (3) Add to the worklist all pointer induction variables and their update
4281 // instructions.
4282 //
4283 // TODO: Once we are able to vectorize pointer induction variables we should
4284 // no longer insert them into the worklist here.
4285 auto *Latch = TheLoop->getLoopLatch();
4286 for (auto &Induction : *Legal->getInductionVars()) {
4287 auto *Ind = Induction.first;
4288 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4289 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4290 continue;
4291 Worklist.insert(Ind);
4292 Worklist.insert(IndUpdate);
4293    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4294    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4295                      << "\n");
4296 }
4297
4298 // Insert the forced scalars.
4299 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4300 // induction variable when the PHI user is scalarized.
4301 auto ForcedScalar = ForcedScalars.find(VF);
4302 if (ForcedScalar != ForcedScalars.end())
4303 for (auto *I : ForcedScalar->second)
4304 Worklist.insert(I);
4305
4306 // Expand the worklist by looking through any bitcasts and getelementptr
4307 // instructions we've already identified as scalar. This is similar to the
4308 // expansion step in collectLoopUniforms(); however, here we're only
4309 // expanding to include additional bitcasts and getelementptr instructions.
4310 unsigned Idx = 0;
4311 while (Idx != Worklist.size()) {
4312 Instruction *Dst = Worklist[Idx++];
4313 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4314 continue;
4315 auto *Src = cast<Instruction>(Dst->getOperand(0));
4316 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4317 auto *J = cast<Instruction>(U);
4318 return !TheLoop->contains(J) || Worklist.count(J) ||
4319 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4320 isScalarUse(J, Src));
4321 })) {
4322 Worklist.insert(Src);
4323      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4324 }
4325 }
4326
4327 // An induction variable will remain scalar if all users of the induction
4328 // variable and induction variable update remain scalar.
4329 for (auto &Induction : *Legal->getInductionVars()) {
4330 auto *Ind = Induction.first;
4331 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4332
4333 // We already considered pointer induction variables, so there's no reason
4334 // to look at their users again.
4335 //
4336 // TODO: Once we are able to vectorize pointer induction variables we
4337 // should no longer skip over them here.
4338 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4339 continue;
4340
4341 // Determine if all users of the induction variable are scalar after
4342 // vectorization.
4343 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4344 auto *I = cast<Instruction>(U);
4345 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4346 });
4347 if (!ScalarInd)
4348 continue;
4349
4350 // Determine if all users of the induction variable update instruction are
4351 // scalar after vectorization.
4352 auto ScalarIndUpdate =
4353 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4354 auto *I = cast<Instruction>(U);
4355 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4356 });
4357 if (!ScalarIndUpdate)
4358 continue;
4359
4360 // The induction variable and its update instruction will remain scalar.
4361 Worklist.insert(Ind);
4362 Worklist.insert(IndUpdate);
4363    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4364    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4365                      << "\n");
4366 }
4367
4368 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4369}
4370
4371bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4372 if (!blockNeedsPredication(I->getParent()))
4373 return false;
4374 switch(I->getOpcode()) {
4375 default:
4376 break;
4377 case Instruction::Load:
4378 case Instruction::Store: {
4379 if (!Legal->isMaskRequired(I))
4380 return false;
4381 auto *Ptr = getLoadStorePointerOperand(I);
4382 auto *Ty = getMemInstValueType(I);
4383 // We have already decided how to vectorize this instruction, get that
4384 // result.
4385 if (VF > 1) {
4386 InstWidening WideningDecision = getWideningDecision(I, VF);
4387      assert(WideningDecision != CM_Unknown &&
4388             "Widening decision should be ready at this moment");
4389 return WideningDecision == CM_Scalarize;
4390 }
4391 return isa<LoadInst>(I) ?
4392 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4393 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4394 }
4395 case Instruction::UDiv:
4396 case Instruction::SDiv:
4397 case Instruction::SRem:
4398 case Instruction::URem:
4399 return mayDivideByZero(*I);
4400 }
4401 return false;
4402}
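// Illustrative sketch (not part of LoopVectorize.cpp): why a divide in a
// block that needs predication is treated as "scalar with predication". A
// plain vector udiv would also execute on masked-off lanes, where the
// divisor may be zero. The helper name and types below are hypothetical.
#include <cstdint>
#include <vector>

static void predicatedUDiv(const std::vector<uint32_t> &A,
                           const std::vector<uint32_t> &B,
                           const std::vector<bool> &Mask,
                           std::vector<uint32_t> &Out) {
  for (size_t I = 0, E = A.size(); I != E; ++I)
    if (Mask[I])              // only active lanes divide; B[I] may be 0 elsewhere
      Out[I] = A[I] / B[I];
}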
4403
4404bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4405 unsigned VF) {
4406  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4407  assert(getWideningDecision(I, VF) == CM_Unknown &&
4408         "Decision should not be set yet.");
4409 auto *Group = getInterleavedAccessGroup(I);
4410  assert(Group && "Must have a group.");
4411
4412 // Check if masking is required.
4413 // A Group may need masking for one of two reasons: it resides in a block that
4414 // needs predication, or it was decided to use masking to deal with gaps.
4415 bool PredicatedAccessRequiresMasking =
4416 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4417 bool AccessWithGapsRequiresMasking =
4418 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4419 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4420 return true;
4421
4422 // If masked interleaving is required, we expect that the user/target had
4423 // enabled it, because otherwise it either wouldn't have been created or
4424 // it should have been invalidated by the CostModel.
4425  assert(useMaskedInterleavedAccesses(TTI) &&
4426         "Masked interleave-groups for predicated accesses are not enabled.");
4427
4428 auto *Ty = getMemInstValueType(I);
4429 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4430 : TTI.isLegalMaskedStore(Ty);
4431}
4432
4433bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4434 unsigned VF) {
4435 // Get and ensure we have a valid memory instruction.
4436 LoadInst *LI = dyn_cast<LoadInst>(I);
4437 StoreInst *SI = dyn_cast<StoreInst>(I);
4438  assert((LI || SI) && "Invalid memory instruction");
4439
4440 auto *Ptr = getLoadStorePointerOperand(I);
4441
4442 // In order to be widened, the pointer should be consecutive, first of all.
4443 if (!Legal->isConsecutivePtr(Ptr))
4444 return false;
4445
4446 // If the instruction is a store located in a predicated block, it will be
4447 // scalarized.
4448 if (isScalarWithPredication(I))
4449 return false;
4450
4451  // If the instruction's allocated size doesn't equal its type size, it
4452 // requires padding and will be scalarized.
4453 auto &DL = I->getModule()->getDataLayout();
4454 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4455 if (hasIrregularType(ScalarTy, DL, VF))
4456 return false;
4457
4458 return true;
4459}
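// Assumed shape of the hasIrregularType() check referenced above (a sketch,
// not necessarily the exact upstream implementation): a scalar type whose
// allocation size differs from its store size needs padding, so packing VF
// such elements contiguously would not match the scalar in-memory layout.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

static bool hasIrregularTypeSketch(llvm::Type *Ty, const llvm::DataLayout &DL) {
  // e.g. x86_fp80: 80 bits stored, but allocated with padding to 128 bits.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}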
4460
4461void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4462 // We should not collect Uniforms more than once per VF. Right now,
4463 // this function is called from collectUniformsAndScalars(), which
4464 // already does this check. Collecting Uniforms for VF=1 does not make any
4465 // sense.
4466
4467  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4468         "This function should not be visited twice for the same VF");
4469
4470 // Visit the list of Uniforms. If we do not find any uniform value, we will
4471 // not analyze it again. Uniforms.count(VF) will return 1.
4472 Uniforms[VF].clear();
4473
4474 // We now know that the loop is vectorizable!
4475 // Collect instructions inside the loop that will remain uniform after
4476 // vectorization.
4477
4478 // Global values, params and instructions outside of current loop are out of
4479 // scope.
4480 auto isOutOfScope = [&](Value *V) -> bool {
4481 Instruction *I = dyn_cast<Instruction>(V);
4482 return (!I || !TheLoop->contains(I));
4483 };
4484
4485 SetVector<Instruction *> Worklist;
4486 BasicBlock *Latch = TheLoop->getLoopLatch();
4487
4488 // Start with the conditional branch. If the branch condition is an
4489 // instruction contained in the loop that is only used by the branch, it is
4490 // uniform.
4491 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4492 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4493 Worklist.insert(Cmp);
4494    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4495 }
4496
4497 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4498 // are pointers that are treated like consecutive pointers during
4499 // vectorization. The pointer operands of interleaved accesses are an
4500 // example.
4501 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4502
4503 // Holds pointer operands of instructions that are possibly non-uniform.
4504 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4505
4506 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4507 InstWidening WideningDecision = getWideningDecision(I, VF);
4508    assert(WideningDecision != CM_Unknown &&
4509           "Widening decision should be ready at this moment");
4510
4511 return (WideningDecision == CM_Widen ||
4512 WideningDecision == CM_Widen_Reverse ||
4513 WideningDecision == CM_Interleave);
4514 };
4515 // Iterate over the instructions in the loop, and collect all
4516 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4517 // that a consecutive-like pointer operand will be scalarized, we collect it
4518 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4519 // getelementptr instruction can be used by both vectorized and scalarized
4520 // memory instructions. For example, if a loop loads and stores from the same
4521 // location, but the store is conditional, the store will be scalarized, and
4522 // the getelementptr won't remain uniform.
4523 for (auto *BB : TheLoop->blocks())
4524 for (auto &I : *BB) {
4525 // If there's no pointer operand, there's nothing to do.
4526 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4527 if (!Ptr)
4528 continue;
4529
4530 // True if all users of Ptr are memory accesses that have Ptr as their
4531 // pointer operand.
4532 auto UsersAreMemAccesses =
4533 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4534 return getLoadStorePointerOperand(U) == Ptr;
4535 });
4536
4537 // Ensure the memory instruction will not be scalarized or used by
4538 // gather/scatter, making its pointer operand non-uniform. If the pointer
4539 // operand is used by any instruction other than a memory access, we
4540 // conservatively assume the pointer operand may be non-uniform.
4541 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4542 PossibleNonUniformPtrs.insert(Ptr);
4543
4544 // If the memory instruction will be vectorized and its pointer operand
4545 // is consecutive-like, or interleaving - the pointer operand should
4546 // remain uniform.
4547 else
4548 ConsecutiveLikePtrs.insert(Ptr);
4549 }
4550
4551 // Add to the Worklist all consecutive and consecutive-like pointers that
4552 // aren't also identified as possibly non-uniform.
4553 for (auto *V : ConsecutiveLikePtrs)
4554 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4555      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4556 Worklist.insert(V);
4557 }
4558
4559 // Expand Worklist in topological order: whenever a new instruction
4560 // is added, its users should already be inside Worklist. This ensures
4561 // a uniform instruction will only be used by uniform instructions.
4562 unsigned idx = 0;
4563 while (idx != Worklist.size()) {
4564 Instruction *I = Worklist[idx++];
4565
4566 for (auto OV : I->operand_values()) {
4567 // isOutOfScope operands cannot be uniform instructions.
4568 if (isOutOfScope(OV))
4569 continue;
4570 // First order recurrence Phi's should typically be considered
4571 // non-uniform.
4572 auto *OP = dyn_cast<PHINode>(OV);
4573 if (OP && Legal->isFirstOrderRecurrence(OP))
4574 continue;
4575 // If all the users of the operand are uniform, then add the
4576 // operand into the uniform worklist.
4577 auto *OI = cast<Instruction>(OV);
4578 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4579 auto *J = cast<Instruction>(U);
4580 return Worklist.count(J) ||
4581 (OI == getLoadStorePointerOperand(J) &&
4582 isUniformDecision(J, VF));
4583 })) {
4584 Worklist.insert(OI);
4585        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4586 }
4587 }
4588 }
4589
4590 // Returns true if Ptr is the pointer operand of a memory access instruction
4591 // I, and I is known to not require scalarization.
4592 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4593 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4594 };
4595
4596 // For an instruction to be added into Worklist above, all its users inside
4597 // the loop should also be in Worklist. However, this condition cannot be
4598 // true for phi nodes that form a cyclic dependence. We must process phi
4599 // nodes separately. An induction variable will remain uniform if all users
4600 // of the induction variable and induction variable update remain uniform.
4601 // The code below handles both pointer and non-pointer induction variables.
4602 for (auto &Induction : *Legal->getInductionVars()) {
4603 auto *Ind = Induction.first;
4604 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4605
4606 // Determine if all users of the induction variable are uniform after
4607 // vectorization.
4608 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4609 auto *I = cast<Instruction>(U);
4610 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4611 isVectorizedMemAccessUse(I, Ind);
4612 });
4613 if (!UniformInd)
4614 continue;
4615
4616 // Determine if all users of the induction variable update instruction are
4617 // uniform after vectorization.
4618 auto UniformIndUpdate =
4619 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4620 auto *I = cast<Instruction>(U);
4621 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4622 isVectorizedMemAccessUse(I, IndUpdate);
4623 });
4624 if (!UniformIndUpdate)
4625 continue;
4626
4627 // The induction variable and its update instruction will remain uniform.
4628 Worklist.insert(Ind);
4629 Worklist.insert(IndUpdate);
4630    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4631    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4632                      << "\n");
4633 }
4634
4635 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4636}
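// Hypothetical example (not from this file) of what the analysis above
// typically classifies as uniform-after-vectorization at VF = 4: the latch
// compare and the address computations feeding consecutive accesses execute
// once per vector iteration, while the loaded and stored values are widened.
void saxpyLike(float *A, const float *B, int N) {
  for (int I = 0; I < N; ++I)  // 'I < N' latch compare: uniform per iteration
    A[I] += B[I];              // &A[I], &B[I]: uniform (consecutive pointers)
                               // A[I], B[I] and the add: widened to <4 x float>
}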
4637
4638Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4639 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4640    // TODO: It may be useful to do since it's still likely to be dynamically
4641    // uniform if the target can skip.
4642    LLVM_DEBUG(
4643        dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4644
4645 ORE->emit(
4646 createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4647 << "runtime pointer checks needed. Not enabled for divergent target");
4648
4649 return None;
4650 }
4651
4652 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4653 if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4654 return computeFeasibleMaxVF(OptForSize, TC);
4655
4656 if (Legal->getRuntimePointerChecking()->Need) {
4657 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4658 << "runtime pointer checks needed. Enable vectorization of this "
4659 "loop with '#pragma clang loop vectorize(enable)' when "
4660 "compiling with -Os/-Oz");
4661    LLVM_DEBUG(
4662        dbgs()
4663        << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4664 return None;
4665 }
4666
4667 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4668 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4669 << "runtime SCEV checks needed. Enable vectorization of this "
4670 "loop with '#pragma clang loop vectorize(enable)' when "
4671 "compiling with -Os/-Oz");
4672    LLVM_DEBUG(
4673        dbgs()
4674        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4675 return None;
4676 }
4677
4678 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4679 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4680 ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4681 << "runtime stride == 1 checks needed. Enable vectorization of "
4682 "this loop with '#pragma clang loop vectorize(enable)' when "
4683 "compiling with -Os/-Oz");
4684    LLVM_DEBUG(
4685        dbgs()
4686        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4687 return None;
4688 }
4689
4690 // If we optimize the program for size, avoid creating the tail loop.
4691  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4692
4693 if (TC == 1) {
4694 ORE->emit(createMissedAnalysis("SingleIterationLoop")
4695 << "loop trip count is one, irrelevant for vectorization");
4696    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4697 return None;
4698 }
4699
4700 // Record that scalar epilogue is not allowed.
4701  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4702
4703 IsScalarEpilogueAllowed = !OptForSize;
4704
4705 // We don't create an epilogue when optimizing for size.
4706 // Invalidate interleave groups that require an epilogue if we can't mask
4707 // the interleave-group.
4708 if (!useMaskedInterleavedAccesses(TTI))
4709 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4710
4711 unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4712
4713 if (TC > 0 && TC % MaxVF == 0) {
4714    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4715 return MaxVF;
4716 }
4717
4718 // If we don't know the precise trip count, or if the trip count that we
4719 // found modulo the vectorization factor is not zero, try to fold the tail
4720 // by masking.
4721 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4722 if (Legal->canFoldTailByMasking()) {
4723 FoldTailByMasking = true;
4724 return MaxVF;
4725 }
4726
4727 if (TC == 0) {
4728 ORE->emit(
4729 createMissedAnalysis("UnknownLoopCountComplexCFG")
4730 << "unable to calculate the loop count due to complex control flow");
4731 return None;
4732 }
4733
4734 ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4735 << "cannot optimize for size and vectorize at the same time. "
4736 "Enable vectorization of this loop with '#pragma clang loop "
4737 "vectorize(enable)' when compiling with -Os/-Oz");
4738 return None;
4739}
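// Worked example with illustrative numbers for the -Os/-Oz tail decision
// above: a known trip count that is a multiple of MaxVF needs no scalar
// tail; otherwise the loop is vectorized only if the tail can be folded by
// masking.
#include <cstdio>

int main() {
  unsigned TC = 128, MaxVF = 8;                         // assumed values
  if (TC > 0 && TC % MaxVF == 0)
    std::printf("no tail: %u = %u x VF=%u\n", TC, TC / MaxVF, MaxVF);
  else
    std::printf("tail of %u iterations needs folding or an epilogue\n",
                TC % MaxVF);
  return 0;
}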
4740
4741unsigned
4742LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4743 unsigned ConstTripCount) {
4744 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4745 unsigned SmallestType, WidestType;
4746 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4747 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4748
4749 // Get the maximum safe dependence distance in bits computed by LAA.
4750 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4751 // the memory accesses that is most restrictive (involved in the smallest
4752 // dependence distance).
4753 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4754
4755 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4756
4757 unsigned MaxVectorSize = WidestRegister / WidestType;
4758
4759  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4760                    << " / " << WidestType << " bits.\n");
4761  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4762                    << WidestRegister << " bits.\n");
4763
4764  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4765                                 " into one vector!");
4766 if (MaxVectorSize == 0) {
4767    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4768 MaxVectorSize = 1;
4769 return MaxVectorSize;
4770 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4771 isPowerOf2_32(ConstTripCount)) {
4772 // We need to clamp the VF to be the ConstTripCount. There is no point in
4773 // choosing a higher viable VF as done in the loop below.
4774    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4775                      << ConstTripCount << "\n");
4776 MaxVectorSize = ConstTripCount;
4777 return MaxVectorSize;
4778 }
4779
4780 unsigned MaxVF = MaxVectorSize;
4781 if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4782 (MaximizeBandwidth && !OptForSize)) {
4783 // Collect all viable vectorization factors larger than the default MaxVF
4784 // (i.e. MaxVectorSize).
4785 SmallVector<unsigned, 8> VFs;
4786 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4787 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4788 VFs.push_back(VS);
4789
4790 // For each VF calculate its register usage.
4791 auto RUs = calculateRegisterUsage(VFs);
4792
4793 // Select the largest VF which doesn't require more registers than existing
4794 // ones.
4795 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4796 for (int i = RUs.size() - 1; i >= 0; --i) {
4797 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4798 MaxVF = VFs[i];
4799 break;
4800 }
4801 }
4802 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4803 if (MaxVF < MinVF) {
4804        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4805                          << ") with target's minimum: " << MinVF << '\n');
4806 MaxVF = MinVF;
4807 }
4808 }
4809 }
4810 return MaxVF;
4811}
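// Worked example with illustrative numbers for the computation above: with
// 256-bit vector registers and a widest scalar type of 32 bits, the default
// MaxVectorSize is 256 / 32 = 8 lanes; a known power-of-two trip count of 4
// clamps it down to 4.
#include <cstdio>

int main() {
  unsigned WidestRegister = 256, WidestType = 32, ConstTripCount = 4;
  unsigned MaxVectorSize = WidestRegister / WidestType;     // 8
  if (ConstTripCount && ConstTripCount < MaxVectorSize)     // 4 is a power of two
    MaxVectorSize = ConstTripCount;                         // clamped to 4
  std::printf("MaxVectorSize = %u\n", MaxVectorSize);
  return 0;
}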
4812
4813VectorizationFactor
4814LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4815 float Cost = expectedCost(1).first;
4816 const float ScalarCost = Cost;
4817 unsigned Width = 1;
4818  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4819
4820 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4821 if (ForceVectorization && MaxVF > 1) {
4822 // Ignore scalar width, because the user explicitly wants vectorization.
4823 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4824 // evaluation.
4825 Cost = std::numeric_limits<float>::max();
4826 }
4827
4828 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4829    // Notice that the vector loop needs to be executed fewer times, so
4830 // we need to divide the cost of the vector loops by the width of
4831 // the vector elements.
4832 VectorizationCostTy C = expectedCost(i);
4833 float VectorCost = C.first / (float)i;
4834    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4835                      << " costs: " << (int)VectorCost << ".\n");
4836 if (!C.second && !ForceVectorization) {
4837      LLVM_DEBUG(
4838          dbgs() << "LV: Not considering vector loop of width " << i
4839                 << " because it will not generate any vector instructions.\n");
4840 continue;
4841 }
4842 if (VectorCost < Cost) {
4843 Cost = VectorCost;
4844 Width = i;
4845 }
4846 }
4847
4848 if (!EnableCondStoresVectorization && NumPredStores) {
4849 ORE->emit(createMissedAnalysis("ConditionalStore")
4850 << "store that is conditionally executed prevents vectorization");
4851    LLVM_DEBUG(
4852        dbgs() << "LV: No vectorization. There are conditional stores.\n");
4853 Width = 1;
4854 Cost = ScalarCost;
4855 }
4856
4857  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4858                 << "LV: Vectorization seems to be not beneficial, "
4859                 << "but was forced by a user.\n");
4860  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4861 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4862 return Factor;
4863}
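// Worked example with made-up costs for the selection loop above: the scalar
// cost competes against each vector cost divided by its width, and the
// cheapest per-lane cost wins.
#include <cstdio>

int main() {
  float ScalarCost = 8.0f;              // expectedCost(1)
  float CostVF2 = 10.0f / 2;            // 5.0 per lane
  float CostVF4 = 14.0f / 4;            // 3.5 per lane -> selected
  std::printf("scalar %.1f, VF=2 %.1f, VF=4 %.1f -> pick VF=4\n",
              ScalarCost, CostVF2, CostVF4);
  return 0;
}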
4864
4865std::pair<unsigned, unsigned>
4866LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4867 unsigned MinWidth = -1U;
4868 unsigned MaxWidth = 8;
4869 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4870
4871 // For each block.
4872 for (BasicBlock *BB : TheLoop->blocks()) {
4873 // For each instruction in the loop.
4874 for (Instruction &I : BB->instructionsWithoutDebug()) {
4875 Type *T = I.getType();
4876
4877 // Skip ignored values.
4878 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4879 continue;
4880
4881 // Only examine Loads, Stores and PHINodes.
4882 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4883 continue;
4884
4885 // Examine PHI nodes that are reduction variables. Update the type to
4886 // account for the recurrence type.
4887 if (auto *PN = dyn_cast<PHINode>(&I)) {
4888 if (!Legal->isReductionVariable(PN))
4889 continue;
4890 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4891 T = RdxDesc.getRecurrenceType();
4892 }
4893
4894 // Examine the stored values.
4895 if (auto *ST = dyn_cast<StoreInst>(&I))
4896 T = ST->getValueOperand()->getType();
4897
4898 // Ignore loaded pointer types and stored pointer types that are not
4899 // vectorizable.
4900 //
4901 // FIXME: The check here attempts to predict whether a load or store will
4902 // be vectorized. We only know this for certain after a VF has
4903 // been selected. Here, we assume that if an access can be
4904 // vectorized, it will be. We should also look at extending this
4905 // optimization to non-pointer types.
4906 //
4907 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4908 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4909 continue;
4910
4911 MinWidth = std::min(MinWidth,
4912 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4913 MaxWidth = std::max(MaxWidth,
4914 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4915 }
4916 }
4917
4918 return {MinWidth, MaxWidth};
4919}
4920
4921unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4922 unsigned VF,
4923 unsigned LoopCost) {
4924 // -- The interleave heuristics --
4925 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4926 // There are many micro-architectural considerations that we can't predict
4927 // at this level. For example, frontend pressure (on decode or fetch) due to
4928 // code size, or the number and capabilities of the execution ports.
4929 //
4930 // We use the following heuristics to select the interleave count:
4931 // 1. If the code has reductions, then we interleave to break the cross
4932 // iteration dependency.
4933 // 2. If the loop is really small, then we interleave to reduce the loop
4934 // overhead.
4935 // 3. We don't interleave if we think that we will spill registers to memory
4936 // due to the increased register pressure.
4937
4938 // When we optimize for size, we don't interleave.
4939 if (OptForSize)
30
Taking false branch
4940 return 1;
4941
4942 // We used the distance for the interleave count.
4943 if (Legal->getMaxSafeDepDistBytes() != -1U)
31
Assuming the condition is false
32
Taking false branch
4944 return 1;
4945
4946 // Do not interleave loops with a relatively small trip count.
4947 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4948 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
33
Assuming 'TC' is <= 1
4949 return 1;
4950
4951 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4952  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
34
Assuming 'DebugFlag' is 0
35
Loop condition is false. Exiting loop
4953                    << " registers\n");
4954
4955 if (VF == 1) {
36
Taking false branch
4956 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4957 TargetNumRegisters = ForceTargetNumScalarRegs;
4958 } else {
4959 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
37
Assuming the condition is false
38
Taking false branch
4960 TargetNumRegisters = ForceTargetNumVectorRegs;
4961 }
4962
4963 RegisterUsage R = calculateRegisterUsage({VF})[0];
4964 // We divide by these constants so assume that we have at least one
4965 // instruction that uses at least one register.
4966 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4967
4968 // We calculate the interleave count using the following formula.
4969 // Subtract the number of loop invariants from the number of available
4970 // registers. These registers are used by all of the interleaved instances.
4971 // Next, divide the remaining registers by the number of registers that is
4972 // required by the loop, in order to estimate how many parallel instances
4973 // fit without causing spills. All of this is rounded down if necessary to be
4974 // a power of two. We want power of two interleave count to simplify any
4975 // addressing operations or alignment considerations.
4976 // We also want power of two interleave counts to ensure that the induction
4977 // variable of the vector loop wraps to zero, when tail is folded by masking;
4978 // this currently happens when OptForSize, in which case IC is set to 1 above.
4979 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4980 R.MaxLocalUsers);
4981
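// Worked example with illustrative numbers for the formula above: with 16
// vector registers, 2 loop-invariant values, and a peak of 5 registers live
// inside the loop, IC = PowerOf2Floor((16 - 2) / 5) = PowerOf2Floor(2) = 2
// interleaved instances.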
4982 // Don't count the induction variable as interleaved.
4983 if (EnableIndVarRegisterHeur)
39
Assuming the condition is false
40
Taking false branch
4984 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4985 std::max(1U, (R.MaxLocalUsers - 1)));
4986
4987 // Clamp the interleave ranges to reasonable counts.
4988 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4989
4990 // Check if the user has overridden the max.
4991 if (VF == 1) {
41
Taking false branch
4992 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4993 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4994 } else {
4995 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
42
Assuming the condition is false
43
Taking false branch
4996 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4997 }
4998
4999 // If we did not calculate the cost for VF (because the user selected the VF)
5000 // then we calculate the cost of VF here.
5001 if (LoopCost == 0)
44
Taking true branch
5002 LoopCost = expectedCost(VF).first;
45
The value 0 is assigned to 'LoopCost'
5003
5004 // Clamp the calculated IC to be between the 1 and the max interleave count
5005 // that the target allows.
5006 if (IC > MaxInterleaveCount)
46
Assuming 'IC' is <= 'MaxInterleaveCount'
47
Taking false branch
5007 IC = MaxInterleaveCount;
5008 else if (IC < 1)
48
Assuming 'IC' is >= 1
49
Taking false branch
5009 IC = 1;
5010
5011 // Interleave if we vectorized this loop and there is a reduction that could
5012 // benefit from interleaving.
5013 if (VF > 1 && !Legal->getReductionVars()->empty()) {
50
Assuming the condition is false
51
Taking false branch
5014    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5015 return IC;
5016 }
5017
5018 // Note that if we've already vectorized the loop we will have done the
5019 // runtime check and so interleaving won't require further checks.
5020 bool InterleavingRequiresRuntimePointerCheck =
5021 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5022
5023 // We want to interleave small loops in order to reduce the loop overhead and
5024 // potentially expose ILP opportunities.
5025  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
52
Assuming 'DebugFlag' is 0
53
Loop condition is false. Exiting loop
5026 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
54
Assuming the condition is true
55
Taking true branch
5027 // We assume that the cost overhead is 1 and we use the cost model
5028 // to estimate the cost of the loop and interleave until the cost of the
5029 // loop overhead is about 5% of the cost of the loop.
5030 unsigned SmallIC =
5031 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
56
Division by zero
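// Analyzer path summary: expectedCost(VF).first returned 0 at step 45, so
// LoopCost is 0 here and 'SmallLoopCost / LoopCost' divides by zero. A
// minimal sketch of one possible guard (an assumption, not the upstream
// fix) would clamp the cost before it is used as a divisor:
//
//   if (LoopCost == 0)
//     LoopCost = expectedCost(VF).first;
//   LoopCost = std::max(LoopCost, 1U);   // avoid dividing by a zero cost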
5032
5033 // Interleave until store/load ports (estimated by max interleave count) are
5034 // saturated.
5035 unsigned NumStores = Legal->getNumStores();
5036 unsigned NumLoads = Legal->getNumLoads();
5037 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5038 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5039
5040 // If we have a scalar reduction (vector reductions are already dealt with
5041 // by this point), we can increase the critical path length if the loop
5042    // we're interleaving is inside another loop. Limit it to 2 by default, so the
5043 // critical path only gets increased by one reduction operation.
5044 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5045 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5046 SmallIC = std::min(SmallIC, F);
5047 StoresIC = std::min(StoresIC, F);
5048 LoadsIC = std::min(LoadsIC, F);
5049 }
5050
5051 if (EnableLoadStoreRuntimeInterleave &&
5052 std::max(StoresIC, LoadsIC) > SmallIC) {
5053      LLVM_DEBUG(
5054          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5055 return std::max(StoresIC, LoadsIC);
5056 }
5057
5058    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5059 return SmallIC;
5060 }
5061
5062 // Interleave if this is a large loop (small loops are already dealt with by
5063 // this point) that could benefit from interleaving.
5064 bool HasReductions = !Legal->getReductionVars()->empty();
5065 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5066    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5067 return IC;
5068 }
5069
5070  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5071 return 1;
5072}
5073
5074SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5075LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5076 // This function calculates the register usage by measuring the highest number
5077 // of values that are alive at a single location. Obviously, this is a very
5078 // rough estimation. We scan the loop in topological order and
5079 // assign a number to each instruction. We use RPO to ensure that defs are
5080 // met before their users. We assume that each instruction that has in-loop
5081 // users starts an interval. We record every time that an in-loop value is
5082 // used, so we have a list of the first and last occurrences of each
5083 // instruction. Next, we transpose this data structure into a multi map that
5084 // holds the list of intervals that *end* at a specific location. This multi
5085 // map allows us to perform a linear search. We scan the instructions linearly
5086 // and record each time that a new interval starts, by placing it in a set.
5087 // If we find this value in the multi-map then we remove it from the set.
5088 // The max register usage is the maximum size of the set.
5089 // We also search for instructions that are defined outside the loop, but are
5090 // used inside the loop. We need this number separately from the max-interval
5091 // usage number because when we unroll, loop-invariant values do not take
5092 // more registers.
5093 LoopBlocksDFS DFS(TheLoop);
5094 DFS.perform(LI);
5095
5096 RegisterUsage RU;
5097
5098 // Each 'key' in the map opens a new interval. The values
5099 // of the map are the index of the 'last seen' usage of the
5100 // instruction that is the key.
5101 using IntervalMap = DenseMap<Instruction *, unsigned>;
5102
5103 // Maps instruction to its index.
5104 SmallVector<Instruction *, 64> IdxToInstr;
5105 // Marks the end of each interval.
5106 IntervalMap EndPoint;
5107 // Saves the list of instruction indices that are used in the loop.
5108 SmallPtrSet<Instruction *, 8> Ends;
5109 // Saves the list of values that are used in the loop but are
5110 // defined outside the loop, such as arguments and constants.
5111 SmallPtrSet<Value *, 8> LoopInvariants;
5112
5113 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5114 for (Instruction &I : BB->instructionsWithoutDebug()) {
5115 IdxToInstr.push_back(&I);
5116
5117 // Save the end location of each USE.
5118 for (Value *U : I.operands()) {
5119 auto *Instr = dyn_cast<Instruction>(U);
5120
5121 // Ignore non-instruction values such as arguments, constants, etc.
5122 if (!Instr)
5123 continue;
5124
5125 // If this instruction is outside the loop then record it and continue.
5126 if (!TheLoop->contains(Instr)) {
5127 LoopInvariants.insert(Instr);
5128 continue;
5129 }
5130
5131 // Overwrite previous end points.
5132 EndPoint[Instr] = IdxToInstr.size();
5133 Ends.insert(Instr);
5134 }
5135 }
5136 }
5137
5138 // Saves the list of intervals that end with the index in 'key'.
5139 using InstrList = SmallVector<Instruction *, 2>;
5140 DenseMap<unsigned, InstrList> TransposeEnds;
5141
5142 // Transpose the EndPoints to a list of values that end at each index.
5143 for (auto &Interval : EndPoint)
5144 TransposeEnds[Interval.second].push_back(Interval.first);
5145
5146 SmallPtrSet<Instruction *, 8> OpenIntervals;
5147
5148 // Get the size of the widest register.
5149 unsigned MaxSafeDepDist = -1U;
5150 if (Legal->getMaxSafeDepDistBytes() != -1U)
5151 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5152 unsigned WidestRegister =
5153 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5154 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5155
5156 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5157 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5158
5159 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5160
5161 // A lambda that gets the register usage for the given type and VF.
5162 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5163 if (Ty->isTokenTy())
5164 return 0U;
5165 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5166 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5167 };
5168
5169 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5170 Instruction *I = IdxToInstr[i];
5171
5172 // Remove all of the instructions that end at this location.
5173 InstrList &List = TransposeEnds[i];
5174 for (Instruction *ToRemove : List)
5175 OpenIntervals.erase(ToRemove);
5176
5177 // Ignore instructions that are never used within the loop.
5178 if (Ends.find(I) == Ends.end())
5179 continue;
5180
5181 // Skip ignored values.
5182 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5183 continue;
5184
5185 // For each VF find the maximum usage of registers.
5186 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5187 if (VFs[j] == 1) {
5188 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5189 continue;
5190 }
5191 collectUniformsAndScalars(VFs[j]);
5192 // Count the number of live intervals.
5193 unsigned RegUsage = 0;
5194 for (auto Inst : OpenIntervals) {
5195 // Skip ignored values for VF > 1.
5196 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5197 isScalarAfterVectorization(Inst, VFs[j]))
5198 continue;
5199 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5200 }
5201 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5202 }
5203
5204 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5205 << OpenIntervals.size() << '\n');
5206
5207 // Add the current instruction to the list of open intervals.
5208 OpenIntervals.insert(I);
5209 }
5210
5211 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5212 unsigned Invariant = 0;
5213 if (VFs[i] == 1)
5214 Invariant = LoopInvariants.size();
5215 else {
5216 for (auto Inst : LoopInvariants)
5217 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5218 }
5219
5220 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5221 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5222 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5223 << '\n');
5224
5225 RU.LoopInvariantRegs = Invariant;
5226 RU.MaxLocalUsers = MaxUsages[i];
5227 RUs[i] = RU;
5228 }
5229
5230 return RUs;
5231}
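The interval bookkeeping described in the comments of calculateRegisterUsage can be illustrated with a small standalone sketch (not part of LoopVectorize.cpp, and simplified so that every definition is assumed to have an in-loop use): record the last use of each definition, transpose those end points, and track how many intervals are open at once while scanning in order.

#include <algorithm>
#include <map>
#include <set>
#include <vector>

// EndPoint[Def] is the index of the last in-loop use of definition Def.
unsigned maxOpenIntervals(const std::vector<unsigned> &EndPoint) {
  // Transpose: for each index, the definitions whose interval ends there.
  std::map<unsigned, std::vector<unsigned>> TransposeEnds;
  unsigned N = EndPoint.size();
  for (unsigned Def = 0; Def < N; ++Def)
    TransposeEnds[EndPoint[Def]].push_back(Def);

  std::set<unsigned> Open;
  unsigned MaxUsage = 0;
  for (unsigned Idx = 0; Idx < N; ++Idx) {
    for (unsigned Def : TransposeEnds[Idx]) // close intervals that end here
      Open.erase(Def);
    MaxUsage = std::max<unsigned>(MaxUsage, Open.size());
    Open.insert(Idx); // the value defined at Idx is live from here on
  }
  return MaxUsage;
}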
5232
5233bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5234 // TODO: Cost model for emulated masked load/store is completely
5235 // broken. This hack guides the cost model to use an artificially
5236 // high enough value to practically disable vectorization with such
5237 // operations, except where previously deployed legality hack allowed
5238 // using very low cost values. This is to avoid regressions coming simply
5239 // from moving "masked load/store" check from legality to cost model.
5240 // Masked Load/Gather emulation was previously never allowed.
5241 // Limited number of Masked Store/Scatter emulation was allowed.
5242 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5243 return isa<LoadInst>(I) ||
5244 (isa<StoreInst>(I) &&
5245 NumPredStores > NumberOfStoresToPredicate);
5246}
5247
5248void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5249 // If we aren't vectorizing the loop, or if we've already collected the
5250 // instructions to scalarize, there's nothing to do. Collection may already
5251 // have occurred if we have a user-selected VF and are now computing the
5252 // expected cost for interleaving.
5253 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5254 return;
5255
5256 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5257 // not profitable to scalarize any instructions, the presence of VF in the
5258 // map will indicate that we've analyzed it already.
5259 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5260
5261 // Find all the instructions that are scalar with predication in the loop and
5262 // determine if it would be better to not if-convert the blocks they are in.
5263 // If so, we also record the instructions to scalarize.
5264 for (BasicBlock *BB : TheLoop->blocks()) {
5265 if (!blockNeedsPredication(BB))
5266 continue;
5267 for (Instruction &I : *BB)
5268 if (isScalarWithPredication(&I)) {
5269 ScalarCostsTy ScalarCosts;
5270 // Do not apply discount logic if hacked cost is needed
5271 // for emulated masked memrefs.
5272 if (!useEmulatedMaskMemRefHack(&I) &&
5273 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5274 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5275 // Remember that BB will remain after vectorization.
5276 PredicatedBBsAfterVectorization.insert(BB);
5277 }
5278 }
5279}
5280
5281int LoopVectorizationCostModel::computePredInstDiscount(
5282 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5283 unsigned VF) {
5284 assert(!isUniformAfterVectorization(PredInst, VF) &&
5285 "Instruction marked uniform-after-vectorization will be predicated");
5286
5287 // Initialize the discount to zero, meaning that the scalar version and the
5288 // vector version cost the same.
5289 int Discount = 0;
5290
5291 // Holds instructions to analyze. The instructions we visit are mapped in
5292 // ScalarCosts. Those instructions are the ones that would be scalarized if
5293 // we find that the scalar version costs less.
5294 SmallVector<Instruction *, 8> Worklist;
5295
5296 // Returns true if the given instruction can be scalarized.
5297 auto canBeScalarized = [&](Instruction *I) -> bool {
5298 // We only attempt to scalarize instructions forming a single-use chain
5299 // from the original predicated block that would otherwise be vectorized.
5300 // Although not strictly necessary, we give up on instructions we know will
5301 // already be scalar to avoid traversing chains that are unlikely to be
5302 // beneficial.
5303 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5304 isScalarAfterVectorization(I, VF))
5305 return false;
5306
5307 // If the instruction is scalar with predication, it will be analyzed
5308 // separately. We ignore it within the context of PredInst.
5309 if (isScalarWithPredication(I))
5310 return false;
5311
5312 // If any of the instruction's operands are uniform after vectorization,
5313 // the instruction cannot be scalarized. This prevents, for example, a
5314 // masked load from being scalarized.
5315 //
5316 // We assume we will only emit a value for lane zero of an instruction
5317 // marked uniform after vectorization, rather than VF identical values.
5318 // Thus, if we scalarize an instruction that uses a uniform, we would
5319 // create uses of values corresponding to the lanes we aren't emitting code
5320 // for. This behavior can be changed by allowing getScalarValue to clone
5321 // the lane zero values for uniforms rather than asserting.
5322 for (Use &U : I->operands())
5323 if (auto *J = dyn_cast<Instruction>(U.get()))
5324 if (isUniformAfterVectorization(J, VF))
5325 return false;
5326
5327 // Otherwise, we can scalarize the instruction.
5328 return true;
5329 };
5330
5331 // Returns true if an operand that cannot be scalarized must be extracted
5332 // from a vector. We will account for this scalarization overhead below. Note
5333 // that the non-void predicated instructions are placed in their own blocks,
5334 // and their return values are inserted into vectors. Thus, an extract would
5335 // still be required.
5336 auto needsExtract = [&](Instruction *I) -> bool {
5337 return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
5338 };
5339
5340 // Compute the expected cost discount from scalarizing the entire expression
5341 // feeding the predicated instruction. We currently only consider expressions
5342 // that are single-use instruction chains.
5343 Worklist.push_back(PredInst);
5344 while (!Worklist.empty()) {
5345 Instruction *I = Worklist.pop_back_val();
5346
5347 // If we've already analyzed the instruction, there's nothing to do.
5348 if (ScalarCosts.find(I) != ScalarCosts.end())
5349 continue;
5350
5351 // Compute the cost of the vector instruction. Note that this cost already
5352 // includes the scalarization overhead of the predicated instruction.
5353 unsigned VectorCost = getInstructionCost(I, VF).first;
5354
5355 // Compute the cost of the scalarized instruction. This cost is the cost of
5356 // the instruction as if it wasn't if-converted and instead remained in the
5357 // predicated block. We will scale this cost by block probability after
5358 // computing the scalarization overhead.
5359 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5360
5361 // Compute the scalarization overhead of needed insertelement instructions
5362 // and phi nodes.
5363 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5364 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5365 true, false);
5366 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5367 }
5368
5369 // Compute the scalarization overhead of needed extractelement
5370 // instructions. For each of the instruction's operands, if the operand can
5371 // be scalarized, add it to the worklist; otherwise, account for the
5372 // overhead.
5373 for (Use &U : I->operands())
5374 if (auto *J = dyn_cast<Instruction>(U.get())) {
5375 assert(VectorType::isValidElementType(J->getType()) &&
5376 "Instruction has non-scalar type");
5377 if (canBeScalarized(J))
5378 Worklist.push_back(J);
5379 else if (needsExtract(J))
5380 ScalarCost += TTI.getScalarizationOverhead(
5381 ToVectorTy(J->getType(),VF), false, true);
5382 }
5383
5384 // Scale the total scalar cost by block probability.
5385 ScalarCost /= getReciprocalPredBlockProb();
5386
5387 // Compute the discount. A non-negative discount means the vector version
5388 // of the instruction costs more, and scalarizing would be beneficial.
5389 Discount += VectorCost - ScalarCost;
5390 ScalarCosts[I] = ScalarCost;
5391 }
5392
5393 return Discount;
5394}
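As a worked illustration of the discount computation above (assuming, for the example only, that getReciprocalPredBlockProb() models a 50% chance of executing the predicated block, i.e. returns 2): with VF = 4 and an instruction whose scalar form costs 1 per lane, ScalarCost = 4 / 2 = 2 before any insert/extract overhead; if its predicated vector form costs 5, this iteration adds 5 - 2 = 3 to the discount, and the non-negative total later tells collectInstsToScalarize that scalarizing the chain is worthwhile.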
5395
5396LoopVectorizationCostModel::VectorizationCostTy
5397LoopVectorizationCostModel::expectedCost(unsigned VF) {
5398 VectorizationCostTy Cost;
5399
5400 // For each block.
5401 for (BasicBlock *BB : TheLoop->blocks()) {
5402 VectorizationCostTy BlockCost;
5403
5404 // For each instruction in the old loop.
5405 for (Instruction &I : BB->instructionsWithoutDebug()) {
5406 // Skip ignored values.
5407 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5408 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5409 continue;
5410
5411 VectorizationCostTy C = getInstructionCost(&I, VF);
5412
5413 // Check if we should override the cost.
5414 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5415 C.first = ForceTargetInstructionCost;
5416
5417 BlockCost.first += C.first;
5418 BlockCost.second |= C.second;
5419 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5420 << " for VF " << VF << " For instruction: " << I
5421 << '\n');
5422 }
5423
5424 // If we are vectorizing a predicated block, it will have been
5425 // if-converted. This means that the block's instructions (aside from
5426 // stores and instructions that may divide by zero) will now be
5427 // unconditionally executed. For the scalar case, we may not always execute
5428 // the predicated block. Thus, scale the block's cost by the probability of
5429 // executing it.
5430 if (VF == 1 && blockNeedsPredication(BB))
5431 BlockCost.first /= getReciprocalPredBlockProb();
5432
5433 Cost.first += BlockCost.first;
5434 Cost.second |= BlockCost.second;
5435 }
5436
5437 return Cost;
5438}
5439
5440/// Gets Address Access SCEV after verifying that the access pattern
5441/// is loop invariant except the induction variable dependence.
5442///
5443/// This SCEV can be sent to the Target in order to estimate the address
5444/// calculation cost.
5445static const SCEV *getAddressAccessSCEV(
5446 Value *Ptr,
5447 LoopVectorizationLegality *Legal,
5448 PredicatedScalarEvolution &PSE,
5449 const Loop *TheLoop) {
5450
5451 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5452 if (!Gep)
5453 return nullptr;
5454
5455 // We are looking for a gep with all loop invariant indices except for one
5456 // which should be an induction variable.
5457 auto SE = PSE.getSE();
5458 unsigned NumOperands = Gep->getNumOperands();
5459 for (unsigned i = 1; i < NumOperands; ++i) {
5460 Value *Opd = Gep->getOperand(i);
5461 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5462 !Legal->isInductionVariable(Opd))
5463 return nullptr;
5464 }
5465
5466 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5467 return PSE.getSCEV(Ptr);
5468}
5469
5470static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5471 return Legal->hasStride(I->getOperand(0)) ||
5472 Legal->hasStride(I->getOperand(1));
5473}
5474
5475unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5476 unsigned VF) {
5477 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5478 Type *ValTy = getMemInstValueType(I);
5479 auto SE = PSE.getSE();
5480
5481 unsigned Alignment = getLoadStoreAlignment(I);
5482 unsigned AS = getLoadStoreAddressSpace(I);
5483 Value *Ptr = getLoadStorePointerOperand(I);
5484 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5485
5486 // Figure out whether the access is strided and get the stride value
5487 // if it's known at compile time.
5488 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5489
5490 // Get the cost of the scalar memory instruction and address computation.
5491 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5492
5493 // Don't pass *I here, since it is scalar but will actually be part of a
5494 // vectorized loop where the user of it is a vectorized instruction.
5495 Cost += VF *
5496 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5497 AS);
5498
5499 // Get the overhead of the extractelement and insertelement instructions
5500 // we might create due to scalarization.
5501 Cost += getScalarizationOverhead(I, VF);
5502
5503 // If we have a predicated store, it may not be executed for each vector
5504 // lane. Scale the cost by the probability of executing the predicated
5505 // block.
5506 if (isPredicatedInst(I)) {
5507 Cost /= getReciprocalPredBlockProb();
5508
5509 if (useEmulatedMaskMemRefHack(I))
5510 // Artificially setting to a high enough value to practically disable
5511 // vectorization with such operations.
5512 Cost = 3000000;
5513 }
5514
5515 return Cost;
5516}
5517
5518unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5519 unsigned VF) {
5520 Type *ValTy = getMemInstValueType(I);
5521 Type *VectorTy = ToVectorTy(ValTy, VF);
5522 unsigned Alignment = getLoadStoreAlignment(I);
5523 Value *Ptr = getLoadStorePointerOperand(I);
5524 unsigned AS = getLoadStoreAddressSpace(I);
5525 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5526
5527 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5528 "Stride should be 1 or -1 for consecutive memory access");
5529 unsigned Cost = 0;
5530 if (Legal->isMaskRequired(I))
5531 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5532 else
5533 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5534
5535 bool Reverse = ConsecutiveStride < 0;
5536 if (Reverse)
5537 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5538 return Cost;
5539}
5540
5541unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5542 unsigned VF) {
5543 Type *ValTy = getMemInstValueType(I);
5544 Type *VectorTy = ToVectorTy(ValTy, VF);
5545 unsigned Alignment = getLoadStoreAlignment(I);
5546 unsigned AS = getLoadStoreAddressSpace(I);
5547 if (isa<LoadInst>(I)) {
5548 return TTI.getAddressComputationCost(ValTy) +
5549 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5550 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5551 }
5552 StoreInst *SI = cast<StoreInst>(I);
5553
5554 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5555 return TTI.getAddressComputationCost(ValTy) +
5556 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5557 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5558 Instruction::ExtractElement,
5559 VectorTy, VF - 1));
5560}
5561
5562unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5563 unsigned VF) {
5564 Type *ValTy = getMemInstValueType(I);
5565 Type *VectorTy = ToVectorTy(ValTy, VF);
5566 unsigned Alignment = getLoadStoreAlignment(I);
5567 Value *Ptr = getLoadStorePointerOperand(I);
5568
5569 return TTI.getAddressComputationCost(VectorTy) +
5570 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5571 Legal->isMaskRequired(I), Alignment);
5572}
5573
5574unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5575 unsigned VF) {
5576 Type *ValTy = getMemInstValueType(I);
5577 Type *VectorTy = ToVectorTy(ValTy, VF);
5578 unsigned AS = getLoadStoreAddressSpace(I);
5579
5580 auto Group = getInterleavedAccessGroup(I);
5581 assert(Group && "Fail to get an interleaved access group.");
5582
5583 unsigned InterleaveFactor = Group->getFactor();
5584 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5585
5586 // Holds the indices of existing members in an interleaved load group.
5587 // An interleaved store group doesn't need this as it doesn't allow gaps.
5588 SmallVector<unsigned, 4> Indices;
5589 if (isa<LoadInst>(I)) {
5590 for (unsigned i = 0; i < InterleaveFactor; i++)
5591 if (Group->getMember(i))
5592 Indices.push_back(i);
5593 }
5594
5595 // Calculate the cost of the whole interleaved group.
5596 bool UseMaskForGaps =
5597 Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5598 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5599 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5600 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5601
5602 if (Group->isReverse()) {
5603 // TODO: Add support for reversed masked interleaved access.
5604 assert(!Legal->isMaskRequired(I) &&
5605 "Reverse masked interleaved access not supported.");
5606 Cost += Group->getNumMembers() *
5607 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5608 }
5609 return Cost;
5610}
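For instance, a reversed interleaved load group with factor 2 at VF = 4 would be costed here as one wide operation on an 8-element vector (WideVecTy); Indices lists only the members actually present, since load groups may have gaps, and each present member then adds a reverse-shuffle cost on top of the interleaved memory-op cost.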
5611
5612unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5613 unsigned VF) {
5614 // Calculate scalar cost only. Vectorization cost should be ready at this
5615 // moment.
5616 if (VF == 1) {
5617 Type *ValTy = getMemInstValueType(I);
5618 unsigned Alignment = getLoadStoreAlignment(I);
5619 unsigned AS = getLoadStoreAddressSpace(I);
5620
5621 return TTI.getAddressComputationCost(ValTy) +
5622 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5623 }
5624 return getWideningCost(I, VF);
5625}
5626
5627LoopVectorizationCostModel::VectorizationCostTy
5628LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5629 // If we know that this instruction will remain uniform, check the cost of
5630 // the scalar version.
5631 if (isUniformAfterVectorization(I, VF))
5632 VF = 1;
5633
5634 if (VF > 1 && isProfitableToScalarize(I, VF))
5635 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5636
5637 // Forced scalars do not have any scalarization overhead.
5638 auto ForcedScalar = ForcedScalars.find(VF);
5639 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5640 auto InstSet = ForcedScalar->second;
5641 if (InstSet.find(I) != InstSet.end())
5642 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5643 }
5644
5645 Type *VectorTy;
5646 unsigned C = getInstructionCost(I, VF, VectorTy);
5647
5648 bool TypeNotScalarized =
5649 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5650 return VectorizationCostTy(C, TypeNotScalarized);
5651}
5652
5653unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5654 unsigned VF) {
5655
5656 if (VF == 1)
5657 return 0;
5658
5659 unsigned Cost = 0;
5660 Type *RetTy = ToVectorTy(I->getType(), VF);
5661 if (!RetTy->isVoidTy() &&
5662 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5663 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5664
5665 // Some targets keep addresses scalar.
5666 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5667 return Cost;
5668
5669 if (CallInst *CI = dyn_cast<CallInst>(I)) {
5670 SmallVector<const Value *, 4> Operands(CI->arg_operands());
5671 Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
5672 } else if (!isa<StoreInst>(I) ||
5673 !TTI.supportsEfficientVectorElementLoadStore()) {
5674 SmallVector<const Value *, 4> Operands(I->operand_values());
5675 Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
5676 }
5677
5678 return Cost;
5679}
5680
5681void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5682 if (VF == 1)
5683 return;
5684 NumPredStores = 0;
5685 for (BasicBlock *BB : TheLoop->blocks()) {
5686 // For each instruction in the old loop.
5687 for (Instruction &I : *BB) {
5688 Value *Ptr = getLoadStorePointerOperand(&I);
5689 if (!Ptr)
5690 continue;
5691
5692 // TODO: We should generate better code and update the cost model for
5693 // predicated uniform stores. Today they are treated as any other
5694 // predicated store (see added test cases in
5695 // invariant-store-vectorization.ll).
5696 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5697 NumPredStores++;
5698
5699 if (Legal->isUniform(Ptr) &&
5700 // Conditional loads and stores should be scalarized and predicated.
5701 // isScalarWithPredication cannot be used here since masked
5702 // gather/scatters are not considered scalar with predication.
5703 !Legal->blockNeedsPredication(I.getParent())) {
5704 // TODO: Avoid replicating loads and stores instead of
5705 // relying on instcombine to remove them.
5706 // Load: Scalar load + broadcast
5707 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5708 unsigned Cost = getUniformMemOpCost(&I, VF);
5709 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5710 continue;
5711 }
5712
5713 // We assume that widening is the best solution when possible.
5714 if (memoryInstructionCanBeWidened(&I, VF)) {
5715 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5716 int ConsecutiveStride =
5717 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5718 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5719 "Expected consecutive stride.");
5720 InstWidening Decision =
5721 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5722 setWideningDecision(&I, VF, Decision, Cost);
5723 continue;
5724 }
5725
5726 // Choose between Interleaving, Gather/Scatter or Scalarization.
5727 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5728 unsigned NumAccesses = 1;
5729 if (isAccessInterleaved(&I)) {
5730 auto Group = getInterleavedAccessGroup(&I);
5731 assert(Group && "Fail to get an interleaved access group.");
5732
5733 // Make one decision for the whole group.
5734 if (getWideningDecision(&I, VF) != CM_Unknown)
5735 continue;
5736
5737 NumAccesses = Group->getNumMembers();
5738 if (interleavedAccessCanBeWidened(&I, VF))
5739 InterleaveCost = getInterleaveGroupCost(&I, VF);
5740 }
5741
5742 unsigned GatherScatterCost =
5743 isLegalGatherOrScatter(&I)
5744 ? getGatherScatterCost(&I, VF) * NumAccesses
5745 : std::numeric_limits<unsigned>::max();
5746
5747 unsigned ScalarizationCost =
5748 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5749
5750 // Choose better solution for the current VF,
5751 // write down this decision and use it during vectorization.
5752 unsigned Cost;
5753 InstWidening Decision;
5754 if (InterleaveCost <= GatherScatterCost &&
5755 InterleaveCost < ScalarizationCost) {
5756 Decision = CM_Interleave;
5757 Cost = InterleaveCost;
5758 } else if (GatherScatterCost < ScalarizationCost) {
5759 Decision = CM_GatherScatter;
5760 Cost = GatherScatterCost;
5761 } else {
5762 Decision = CM_Scalarize;
5763 Cost = ScalarizationCost;
5764 }
5765 // If the instruction belongs to an interleave group, the whole group
5766 // receives the same decision. The whole group receives the cost, but
5767 // the cost will actually be assigned to one instruction.
5768 if (auto Group = getInterleavedAccessGroup(&I))
5769 setWideningDecision(Group, VF, Decision, Cost);
5770 else
5771 setWideningDecision(&I, VF, Decision, Cost);
5772 }
5773 }
5774
5775 // Make sure that any load of address and any other address computation
5776 // remains scalar unless there is gather/scatter support. This avoids
5777 // inevitable extracts into address registers, and also has the benefit of
5778 // activating LSR more, since that pass can't optimize vectorized
5779 // addresses.
5780 if (TTI.prefersVectorizedAddressing())
5781 return;
5782
5783 // Start with all scalar pointer uses.
5784 SmallPtrSet<Instruction *, 8> AddrDefs;
5785 for (BasicBlock *BB : TheLoop->blocks())
5786 for (Instruction &I : *BB) {
5787 Instruction *PtrDef =
5788 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5789 if (PtrDef && TheLoop->contains(PtrDef) &&
5790 getWideningDecision(&I, VF) != CM_GatherScatter)
5791 AddrDefs.insert(PtrDef);
5792 }
5793
5794 // Add all instructions used to generate the addresses.
5795 SmallVector<Instruction *, 4> Worklist;
5796 for (auto *I : AddrDefs)
5797 Worklist.push_back(I);
5798 while (!Worklist.empty()) {
5799 Instruction *I = Worklist.pop_back_val();
5800 for (auto &Op : I->operands())
5801 if (auto *InstOp = dyn_cast<Instruction>(Op))
5802 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5803 AddrDefs.insert(InstOp).second)
5804 Worklist.push_back(InstOp);
5805 }
5806
5807 for (auto *I : AddrDefs) {
5808 if (isa<LoadInst>(I)) {
5809 // Setting the desired widening decision should ideally be handled
5810 // by cost functions, but since this involves the task of finding out
5811 // if the loaded register is involved in an address computation, it is
5812 // instead changed here when we know this is the case.
5813 InstWidening Decision = getWideningDecision(I, VF);
5814 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5815 // Scalarize a widened load of address.
5816 setWideningDecision(I, VF, CM_Scalarize,
5817 (VF * getMemoryInstructionCost(I, 1)));
5818 else if (auto Group = getInterleavedAccessGroup(I)) {
5819 // Scalarize an interleave group of address loads.
5820 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5821 if (Instruction *Member = Group->getMember(I))
5822 setWideningDecision(Member, VF, CM_Scalarize,
5823 (VF * getMemoryInstructionCost(Member, 1)));
5824 }
5825 }
5826 } else
5827 // Make sure I gets scalarized and receives a cost estimate without
5828 // scalarization overhead.
5829 ForcedScalars[VF].insert(I);
5830 }
5831}
5832
5833unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5834 unsigned VF,
5835 Type *&VectorTy) {
5836 Type *RetTy = I->getType();
5837 if (canTruncateToMinimalBitwidth(I, VF))
5838 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5839 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5840 auto SE = PSE.getSE();
5841
5842 // TODO: We need to estimate the cost of intrinsic calls.
5843 switch (I->getOpcode()) {
5844 case Instruction::GetElementPtr:
5845 // We mark this instruction as zero-cost because the cost of GEPs in
5846 // vectorized code depends on whether the corresponding memory instruction
5847 // is scalarized or not. Therefore, we handle GEPs with the memory
5848 // instruction cost.
5849 return 0;
5850 case Instruction::Br: {
5851 // In cases of scalarized and predicated instructions, there will be VF
5852 // predicated blocks in the vectorized loop. Each branch around these
5853 // blocks requires also an extract of its vector compare i1 element.
5854 bool ScalarPredicatedBB = false;
5855 BranchInst *BI = cast<BranchInst>(I);
5856 if (VF > 1 && BI->isConditional() &&
5857 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5858 PredicatedBBsAfterVectorization.end() ||
5859 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5860 PredicatedBBsAfterVectorization.end()))
5861 ScalarPredicatedBB = true;
5862
5863 if (ScalarPredicatedBB) {
5864 // Return cost for branches around scalarized and predicated blocks.
5865 Type *Vec_i1Ty =
5866 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5867 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5868 (TTI.getCFInstrCost(Instruction::Br) * VF));
5869 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5870 // The back-edge branch will remain, as will all scalar branches.
5871 return TTI.getCFInstrCost(Instruction::Br);
5872 else
5873 // This branch will be eliminated by if-conversion.
5874 return 0;
5875 // Note: We currently assume zero cost for an unconditional branch inside
5876 // a predicated block since it will become a fall-through, although we
5877 // may decide in the future to call TTI for all branches.
5878 }
5879 case Instruction::PHI: {
5880 auto *Phi = cast<PHINode>(I);
5881
5882 // First-order recurrences are replaced by vector shuffles inside the loop.
5883 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
5884 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
5885 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
5886 VectorTy, VF - 1, VectorType::get(RetTy, 1));
5887
5888 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5889 // converted into select instructions. We require N - 1 selects per phi
5890 // node, where N is the number of incoming values.
5891 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5892 return (Phi->getNumIncomingValues() - 1) *
5893 TTI.getCmpSelInstrCost(
5894 Instruction::Select, ToVectorTy(Phi->getType(), VF),
5895 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
5896
5897 return TTI.getCFInstrCost(Instruction::PHI);
5898 }
5899 case Instruction::UDiv:
5900 case Instruction::SDiv:
5901 case Instruction::URem:
5902 case Instruction::SRem:
5903 // If we have a predicated instruction, it may not be executed for each
5904 // vector lane. Get the scalarization cost and scale this amount by the
5905 // probability of executing the predicated block. If the instruction is not
5906 // predicated, we fall through to the next case.
5907 if (VF > 1 && isScalarWithPredication(I)) {
5908 unsigned Cost = 0;
5909
5910 // These instructions have a non-void type, so account for the phi nodes
5911 // that we will create. This cost is likely to be zero. The phi node
5912 // cost, if any, should be scaled by the block probability because it
5913 // models a copy at the end of each predicated block.
5914 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
5915
5916 // The cost of the non-predicated instruction.
5917 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
5918
5919 // The cost of insertelement and extractelement instructions needed for
5920 // scalarization.
5921 Cost += getScalarizationOverhead(I, VF);
5922
5923 // Scale the cost by the probability of executing the predicated blocks.
5924 // This assumes the predicated block for each vector lane is equally
5925 // likely.
5926 return Cost / getReciprocalPredBlockProb();
5927 }
5928 LLVM_FALLTHROUGH;
5929 case Instruction::Add:
5930 case Instruction::FAdd:
5931 case Instruction::Sub:
5932 case Instruction::FSub:
5933 case Instruction::Mul:
5934 case Instruction::FMul:
5935 case Instruction::FDiv:
5936 case Instruction::FRem:
5937 case Instruction::Shl:
5938 case Instruction::LShr:
5939 case Instruction::AShr:
5940 case Instruction::And:
5941 case Instruction::Or:
5942 case Instruction::Xor: {
5943 // Since we will replace the stride by 1 the multiplication should go away.
5944 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5945 return 0;
5946 // Certain instructions can be cheaper to vectorize if they have a constant
5947 // second vector operand. One example of this are shifts on x86.
5948 Value *Op2 = I->getOperand(1);
5949 TargetTransformInfo::OperandValueProperties Op2VP;
5950 TargetTransformInfo::OperandValueKind Op2VK =
5951 TTI.getOperandInfo(Op2, Op2VP);
5952 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
5953 Op2VK = TargetTransformInfo::OK_UniformValue;
5954
5955 SmallVector<const Value *, 4> Operands(I->operand_values());
5956 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5957 return N * TTI.getArithmeticInstrCost(
5958 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
5959 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
5960 }
5961 case Instruction::Select: {
5962 SelectInst *SI = cast<SelectInst>(I);
5963 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5964 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5965 Type *CondTy = SI->getCondition()->getType();
5966 if (!ScalarCond)
5967 CondTy = VectorType::get(CondTy, VF);
5968
5969 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
5970 }
5971 case Instruction::ICmp:
5972 case Instruction::FCmp: {
5973 Type *ValTy = I->getOperand(0)->getType();
5974 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
5975 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5976 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
5977 VectorTy = ToVectorTy(ValTy, VF);
5978 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
5979 }
5980 case Instruction::Store:
5981 case Instruction::Load: {
5982 unsigned Width = VF;
5983 if (Width > 1) {
5984 InstWidening Decision = getWideningDecision(I, Width);
5985 assert(Decision != CM_Unknown &&
5986 "CM decision should be taken at this point");
5987 if (Decision == CM_Scalarize)
5988 Width = 1;
5989 }
5990 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
5991 return getMemoryInstructionCost(I, VF);
5992 }
5993 case Instruction::ZExt:
5994 case Instruction::SExt:
5995 case Instruction::FPToUI:
5996 case Instruction::FPToSI:
5997 case Instruction::FPExt:
5998 case Instruction::PtrToInt:
5999 case Instruction::IntToPtr:
6000 case Instruction::SIToFP:
6001 case Instruction::UIToFP:
6002 case Instruction::Trunc:
6003 case Instruction::FPTrunc:
6004 case Instruction::BitCast: {
6005 // We optimize the truncation of induction variables having constant
6006 // integer steps. The cost of these truncations is the same as the scalar
6007 // operation.
6008 if (isOptimizableIVTruncate(I, VF)) {
6009 auto *Trunc = cast<TruncInst>(I);
6010 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6011 Trunc->getSrcTy(), Trunc);
6012 }
6013
6014 Type *SrcScalarTy = I->getOperand(0)->getType();
6015 Type *SrcVecTy =
6016 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6017 if (canTruncateToMinimalBitwidth(I, VF)) {
6018 // This cast is going to be shrunk. This may remove the cast or it might
6019 // turn it into a slightly different cast. For example, if MinBW == 16,
6020 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6021 //
6022 // Calculate the modified src and dest types.
6023 Type *MinVecTy = VectorTy;
6024 if (I->getOpcode() == Instruction::Trunc) {
6025 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6026 VectorTy =
6027 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6028 } else if (I->getOpcode() == Instruction::ZExt ||
6029 I->getOpcode() == Instruction::SExt) {
6030 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6031 VectorTy =
6032 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6033 }
6034 }
6035
6036 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6037 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6038 }
6039 case Instruction::Call: {
6040 bool NeedToScalarize;
6041 CallInst *CI = cast<CallInst>(I);
6042 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6043 if (getVectorIntrinsicIDForCall(CI, TLI))
6044 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6045 return CallCost;
6046 }
6047 default:
6048 // The cost of executing VF copies of the scalar instruction. This opcode
6049 // is unknown. Assume that it is the same as 'mul'.
6050 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6051 getScalarizationOverhead(I, VF);
6052 } // end of switch.
6053}
6054
6055char LoopVectorize::ID = 0;
6056
6057static const char lv_name[] = "Loop Vectorization";
6058
6059INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6060INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6061INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6062INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6063INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6064INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6065INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6066INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6067INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6068INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6069INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6070INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6071INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6072INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6073INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6074
6075namespace llvm {
6076
6077Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6078
6079Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6080 bool VectorizeOnlyWhenForced) {
6081 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6082}
6083
6084} // end namespace llvm
6085
6086bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6087 // Check if the pointer operand of a load or store instruction is
6088 // consecutive.
6089 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6090 return Legal->isConsecutivePtr(Ptr);
6091 return false;
6092}
6093
6094void LoopVectorizationCostModel::collectValuesToIgnore() {
6095 // Ignore ephemeral values.
6096 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6097
6098 // Ignore type-promoting instructions we identified during reduction
6099 // detection.
6100 for (auto &Reduction : *Legal->getReductionVars()) {
6101 RecurrenceDescriptor &RedDes = Reduction.second;
6102 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6103 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6104 }
6105 // Ignore type-casting instructions we identified during induction
6106 // detection.
6107 for (auto &Induction : *Legal->getInductionVars()) {
6108 InductionDescriptor &IndDes = Induction.second;
6109 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6110 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6111 }
6112}
6113
6114// TODO: we could return a pair of values that specify the max VF and
6115// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6116// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6117// doesn't have a cost model that can choose which plan to execute if
6118// more than one is generated.
6119static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6120 LoopVectorizationCostModel &CM) {
6121 unsigned WidestType;
6122 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6123 return WidestVectorRegBits / WidestType;
6124}
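For example, with 256-bit vector registers and a widest in-loop type of 32 bits, determineVPlanVF returns a VF of 256 / 32 = 8.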
6125
6126VectorizationFactor
6127LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6128 unsigned UserVF) {
6129 unsigned VF = UserVF;
6130 // Outer loop handling: They may require CFG and instruction level
6131 // transformations before even evaluating whether vectorization is profitable.
6132 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6133 // the vectorization pipeline.
6134 if (!OrigLoop->empty()) {
6135 // If the user doesn't provide a vectorization factor, determine a
6136 // reasonable one.
6137 if (!UserVF) {
6138 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6139 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6140
6141 // Make sure we have a VF > 1 for stress testing.
6142 if (VPlanBuildStressTest && VF < 2) {
6143 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6144 << "overriding computed VF.\n");
6146 }
6147 }
6148 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6149 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6150 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6151 << " to build VPlans.\n");
6152 buildVPlans(VF, VF);
6153
6154 // For VPlan build stress testing, we bail out after VPlan construction.
6155 if (VPlanBuildStressTest)
6156 return VectorizationFactor::Disabled();
6157
6158 return {VF, 0};
6159 }
6160
6161 LLVM_DEBUG(
6162 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6163 "VPlan-native path.\n");
6164 return VectorizationFactor::Disabled();
6165}
6166
6167Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
6168 unsigned UserVF) {
6169 assert(OrigLoop->empty() && "Inner loop expected.");
6170 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
6171 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6172 return None;
6173
6174 // Invalidate interleave groups if all blocks of loop will be predicated.
6175 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6176 !useMaskedInterleavedAccesses(*TTI)) {
6177 LLVM_DEBUG(
6178 dbgs()
6179 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6180 "which requires masked-interleaved support.\n");
6181 CM.InterleaveInfo.reset();
6182 }
6183
6184 if (UserVF) {
6185 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6186 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6187 // Collect the instructions (and their associated costs) that will be more
6188 // profitable to scalarize.
6189 CM.selectUserVectorizationFactor(UserVF);
6190 buildVPlansWithVPRecipes(UserVF, UserVF);
6191 LLVM_DEBUG(printPlans(dbgs()));
6192 return {{UserVF, 0}};
6193 }
6194
6195 unsigned MaxVF = MaybeMaxVF.getValue();
6196 assert(MaxVF != 0 && "MaxVF is zero.");
6197
6198 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6199 // Collect Uniform and Scalar instructions after vectorization with VF.
6200 CM.collectUniformsAndScalars(VF);
6201
6202 // Collect the instructions (and their associated costs) that will be more
6203 // profitable to scalarize.
6204 if (VF > 1)
6205 CM.collectInstsToScalarize(VF);
6206 }
6207
6208 buildVPlansWithVPRecipes(1, MaxVF);
6209 LLVM_DEBUG(printPlans(dbgs()));
6210 if (MaxVF == 1)
6211 return VectorizationFactor::Disabled();
6212
6213 // Select the optimal vectorization factor.
6214 return CM.selectVectorizationFactor(MaxVF);
6215}
6216
6217void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6218 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6219 << '\n');
6220 BestVF = VF;
6221 BestUF = UF;
6222
6223 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6224 return !Plan->hasVF(VF);
6225 });
6226 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6227}
6228
6229void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6230 DominatorTree *DT) {
6231 // Perform the actual loop transformation.
6232
6233 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6234 VPCallbackILV CallbackILV(ILV);
6235
6236 VPTransformState State{BestVF, BestUF, LI,
6237 DT, ILV.Builder, ILV.VectorLoopValueMap,
6238 &ILV, CallbackILV};
6239 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6240 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6241
6242 //===------------------------------------------------===//
6243 //
6244 // Notice: any optimization or new instruction that go
6245 // into the code below should also be implemented in
6246 // the cost-model.
6247 //
6248 //===------------------------------------------------===//
6249
6250 // 2. Copy and widen instructions from the old loop into the new loop.
6251 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6252 VPlans.front()->execute(&State);
6253
6254 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6255 // predication, updating analyses.
6256 ILV.fixVectorizedLoop();
6257}
6258
6259void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6260 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6261 BasicBlock *Latch = OrigLoop->getLoopLatch();
6262
6263 // We create new control-flow for the vectorized loop, so the original
6264 // condition will be dead after vectorization if it's only used by the
6265 // branch.
6266 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6267 if (Cmp && Cmp->hasOneUse())
6268 DeadInstructions.insert(Cmp);
6269
6270 // We create new "steps" for induction variable updates to which the original
6271 // induction variables map. An original update instruction will be dead if
6272 // all its users except the induction variable are dead.
6273 for (auto &Induction : *Legal->getInductionVars()) {
6274 PHINode *Ind = Induction.first;
6275 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6276 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6277 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6278 DeadInstructions.end();
6279 }))
6280 DeadInstructions.insert(IndUpdate);
6281
6282 // We record as "Dead" also the type-casting instructions we had identified
6283 // during induction analysis. We don't need any handling for them in the
6284 // vectorized loop because we have proven that, under a proper runtime
6285 // test guarding the vectorized loop, the value of the phi, and the casted
6286 // value of the phi, are the same. The last instruction in this casting chain
6287 // will get its scalar/vector/widened def from the scalar/vector/widened def
6288 // of the respective phi node. Any other casts in the induction def-use chain
6289 // have no other uses outside the phi update chain, and will be ignored.
6290 InductionDescriptor &IndDes = Induction.second;
6291 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6292 DeadInstructions.insert(Casts.begin(), Casts.end());
6293 }
6294}
6295
6296Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6297
6298Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6299
6300Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6301 Instruction::BinaryOps BinOp) {
6302 // When unrolling and the VF is 1, we only need to add a simple scalar.
6303 Type *Ty = Val->getType();
6304 assert(!Ty->isVectorTy() && "Val must be a scalar");
6305
6306 if (Ty->isFloatingPointTy()) {
6307 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6308
6309 // Floating point operations had to be 'fast' to enable the unrolling.
6310 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6311 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6312 }
6313 Constant *C = ConstantInt::get(Ty, StartIdx);
6314 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6315}
6316
6317static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6318 SmallVector<Metadata *, 4> MDs;
6319 // Reserve first location for self reference to the LoopID metadata node.
6320 MDs.push_back(nullptr);
6321 bool IsUnrollMetadata = false;
6322 MDNode *LoopID = L->getLoopID();
6323 if (LoopID) {
6324 // First find existing loop unrolling disable metadata.
6325 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6326 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6327 if (MD) {
6328 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6329 IsUnrollMetadata =
6330 S && S->getString().startswith("llvm.loop.unroll.disable");
6331 }
6332 MDs.push_back(LoopID->getOperand(i));
6333 }
6334 }
6335
6336 if (!IsUnrollMetadata) {
6337 // Add runtime unroll disable metadata.
6338 LLVMContext &Context = L->getHeader()->getContext();
6339 SmallVector<Metadata *, 1> DisableOperands;
6340 DisableOperands.push_back(
6341 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6342 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6343 MDs.push_back(DisableNode);
6344 MDNode *NewLoopID = MDNode::get(Context, MDs);
6345 // Set operand 0 to refer to the loop id itself.
6346 NewLoopID->replaceOperandWith(0, NewLoopID);
6347 L->setLoopID(NewLoopID);
6348 }
6349}
6350
6351bool LoopVectorizationPlanner::getDecisionAndClampRange(
6352 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6353 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6354 bool PredicateAtRangeStart = Predicate(Range.Start);
6355
6356 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6357 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6358 Range.End = TmpVF;
6359 break;
6360 }
6361
6362 return PredicateAtRangeStart;
6363}
6364
6365/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6366/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6367/// of VF's starting at a given VF and extending it as much as possible. Each
6368/// vectorization decision can potentially shorten this sub-range during
6369/// buildVPlan().
6370void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6371 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6372 VFRange SubRange = {VF, MaxVF + 1};
6373 VPlans.push_back(buildVPlan(SubRange));
6374 VF = SubRange.End;
6375 }
6376}
6377
6378VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6379 VPlanPtr &Plan) {
6380 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6381
6382 // Look for cached value.
6383 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6384 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6385 if (ECEntryIt != EdgeMaskCache.end())
6386 return ECEntryIt->second;
6387
6388 VPValue *SrcMask = createBlockInMask(Src, Plan);
6389
6390 // The terminator has to be a branch inst!
6391 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6392 assert(BI && "Unexpected terminator found");
6393
6394 if (!BI->isConditional())
6395 return EdgeMaskCache[Edge] = SrcMask;
6396
6397 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6398 assert(EdgeMask && "No Edge Mask found for condition");
6399
6400 if (BI->getSuccessor(0) != Dst)
6401 EdgeMask = Builder.createNot(EdgeMask);
6402
6403 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6404 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6405
6406 return EdgeMaskCache[Edge] = EdgeMask;
6407}
6408
6409VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6410 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6411
6412 // Look for cached value.
6413 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6414 if (BCEntryIt != BlockMaskCache.end())
6415 return BCEntryIt->second;
6416
6417 // All-one mask is modelled as no-mask following the convention for masked
6418 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6419 VPValue *BlockMask = nullptr;
6420
6421 if (OrigLoop->getHeader() == BB) {
6422 if (!CM.blockNeedsPredication(BB))
6423 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6424
6425 // Introduce the early-exit compare IV <= BTC to form header block mask.
6426 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6427 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6428 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6429 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6430 return BlockMaskCache[BB] = BlockMask;
6431 }
6432
6433 // This is the block mask. We OR all incoming edges.
6434 for (auto *Predecessor : predecessors(BB)) {
6435 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6436 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6437 return BlockMaskCache[BB] = EdgeMask;
6438
6439 if (!BlockMask) { // BlockMask has its initialized nullptr value.
6440 BlockMask = EdgeMask;
6441 continue;
6442 }
6443
6444 BlockMask = Builder.createOr(BlockMask, EdgeMask);
6445 }
6446
6447 return BlockMaskCache[BB] = BlockMask;
6448}
6449
6450VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6451 VFRange &Range,
6452 VPlanPtr &Plan) {
6453 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6454 if (!IG)
6455 return nullptr;
6456
6457 // Now check if IG is relevant for VF's in the given range.
6458 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6459 return [=](unsigned VF) -> bool {
6460 return (VF >= 2 && // Query is illegal for VF == 1
6461 CM.getWideningDecision(I, VF) ==
6462 LoopVectorizationCostModel::CM_Interleave);
6463 };
6464 };
6465 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6466 return nullptr;
6467
6468 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6469 // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6470 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6471 assert(I == IG->getInsertPos() &&
6472 "Generating a recipe for an adjunct member of an interleave group");
6473
6474 VPValue *Mask = nullptr;
6475 if (Legal->isMaskRequired(I))
6476 Mask = createBlockInMask(I->getParent(), Plan);
6477
6478 return new VPInterleaveRecipe(IG, Mask);
6479}
6480
6481VPWidenMemoryInstructionRecipe *
6482VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6483 VPlanPtr &Plan) {
6484 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6485 return nullptr;
6486
6487 auto willWiden = [&](unsigned VF) -> bool {
6488 if (VF == 1)
6489 return false;
6490 if (CM.isScalarAfterVectorization(I, VF) ||
6491 CM.isProfitableToScalarize(I, VF))
6492 return false;
6493 LoopVectorizationCostModel::InstWidening Decision =
6494 CM.getWideningDecision(I, VF);
6495 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6496 "CM decision should be taken at this point.");
6497 assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6498 "Interleave memory opportunity should be caught earlier.");
6499 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6500 };
6501
6502 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6503 return nullptr;
6504
6505 VPValue *Mask = nullptr;
6506 if (Legal->isMaskRequired(I))
6507 Mask = createBlockInMask(I->getParent(), Plan);
6508
6509 return new VPWidenMemoryInstructionRecipe(*I, Mask);
6510}
6511
6512VPWidenIntOrFpInductionRecipe *
6513VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6514 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6515 // Check if this is an integer or fp induction. If so, build the recipe that
6516 // produces its scalar and vector values.
6517 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6518 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6519 II.getKind() == InductionDescriptor::IK_FpInduction)
6520 return new VPWidenIntOrFpInductionRecipe(Phi);
6521
6522 return nullptr;
6523 }
6524
6525 // Optimize the special case where the source is a constant integer
6526 // induction variable. Notice that we can only optimize the 'trunc' case
6527 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6528 // (c) other casts depend on pointer size.
6529
6530 // Determine whether \p K is a truncation based on an induction variable that
6531 // can be optimized.
6532 auto isOptimizableIVTruncate =
6533 [&](Instruction *K) -> std::function<bool(unsigned)> {
6534 return
6535 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6536 };
6537
6538 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6539 isOptimizableIVTruncate(I), Range))
6540 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6541 cast<TruncInst>(I));
6542 return nullptr;
6543}
6544
6545VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6546 PHINode *Phi = dyn_cast<PHINode>(I);
6547 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6548 return nullptr;
6549
6550 // We know that all PHIs in non-header blocks are converted into selects, so
6551 // we don't have to worry about the insertion order and we can just use the
6552 // builder. At this point we generate the predication tree. There may be
6553 // duplications since this is a simple recursive scan, but future
6554 // optimizations will clean it up.
6555
6556 SmallVector<VPValue *, 2> Masks;
6557 unsigned NumIncoming = Phi->getNumIncomingValues();
6558 for (unsigned In = 0; In < NumIncoming; In++) {
6559 VPValue *EdgeMask =
6560 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6561 assert((EdgeMask || NumIncoming == 1) &&
6562 "Multiple predecessors with one having a full mask");
6563 if (EdgeMask)
6564 Masks.push_back(EdgeMask);
6565 }
6566 return new VPBlendRecipe(Phi, Masks);
6567}
6568
6569bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6570 VFRange &Range) {
6571
6572 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6573 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6574
6575 if (IsPredicated)
6576 return false;
6577
6578 auto IsVectorizableOpcode = [](unsigned Opcode) {
6579 switch (Opcode) {
6580 case Instruction::Add:
6581 case Instruction::And:
6582 case Instruction::AShr:
6583 case Instruction::BitCast:
6584 case Instruction::Br:
6585 case Instruction::Call:
6586 case Instruction::FAdd:
6587 case Instruction::FCmp:
6588 case Instruction::FDiv:
6589 case Instruction::FMul:
6590 case Instruction::FPExt:
6591 case Instruction::FPToSI:
6592 case Instruction::FPToUI:
6593 case Instruction::FPTrunc:
6594 case Instruction::FRem:
6595 case Instruction::FSub:
6596 case Instruction::GetElementPtr:
6597 case Instruction::ICmp:
6598 case Instruction::IntToPtr:
6599 case Instruction::Load:
6600 case Instruction::LShr:
6601 case Instruction::Mul:
6602 case Instruction::Or:
6603 case Instruction::PHI:
6604 case Instruction::PtrToInt:
6605 case Instruction::SDiv:
6606 case Instruction::Select:
6607 case Instruction::SExt:
6608 case Instruction::Shl:
6609 case Instruction::SIToFP:
6610 case Instruction::SRem:
6611 case Instruction::Store:
6612 case Instruction::Sub:
6613 case Instruction::Trunc:
6614 case Instruction::UDiv:
6615 case Instruction::UIToFP:
6616 case Instruction::URem:
6617 case Instruction::Xor:
6618 case Instruction::ZExt:
6619 return true;
6620 }
6621 return false;
6622 };
6623
6624 if (!IsVectorizableOpcode(I->getOpcode()))
6625 return false;
6626
6627 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6628 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6629 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6630 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6631 return false;
6632 }
6633
6634 auto willWiden = [&](unsigned VF) -> bool {
6635 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6636 CM.isProfitableToScalarize(I, VF)))
6637 return false;
6638 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6639 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6640 // The following case may be scalarized depending on the VF.
6641 // The flag shows whether we use Intrinsic or a usual Call for vectorized
6642 // version of the instruction.
6643 // Is it beneficial to perform intrinsic call compared to lib call?
6644 bool NeedToScalarize;
6645 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6646 bool UseVectorIntrinsic =
6647 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6648 return UseVectorIntrinsic || !NeedToScalarize;
6649 }
6650 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6651 assert(CM.getWideningDecision(I, VF) ==
6652 LoopVectorizationCostModel::CM_Scalarize &&
6653 "Memory widening decisions should have been taken care by now");
6654 return false;
6655 }
6656 return true;
6657 };
6658
6659 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6660 return false;
6661
6662 // Success: widen this instruction. We optimize the common case where
6663 // consecutive instructions can be represented by a single recipe.
6664 if (!VPBB->empty()) {
6665 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6666 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6667 return true;
6668 }
6669
6670 VPBB->appendRecipe(new VPWidenRecipe(I));
6671 return true;
6672}
6673
6674VPBasicBlock *VPRecipeBuilder::handleReplication(
6675 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6676 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6677 VPlanPtr &Plan) {
6678 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6679 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6680 Range);
6681
6682 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6683 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6684
6685 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6686
6687 // Find if I uses a predicated instruction. If so, it will use its scalar
6688 // value. Avoid hoisting the insert-element which packs the scalar value into
6689 // a vector value, as that happens iff all users use the vector value.
6690 for (auto &Op : I->operands())
6691 if (auto *PredInst = dyn_cast<Instruction>(Op))
6692 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6693 PredInst2Recipe[PredInst]->setAlsoPack(false);
6694
6695 // Finalize the recipe for Instr, first if it is not predicated.
6696 if (!IsPredicated) {
6697 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6698 VPBB->appendRecipe(Recipe);
6699 return VPBB;
6700 }
6701 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6702 assert(VPBB->getSuccessors().empty() &&
6703 "VPBB has successors when handling predicated replication.");
6704 // Record predicated instructions for above packing optimizations.
6705 PredInst2Recipe[I] = Recipe;
6706 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6707 VPBlockUtils::insertBlockAfter(Region, VPBB);
6708 auto *RegSucc = new VPBasicBlock();
6709 VPBlockUtils::insertBlockAfter(RegSucc, Region);
6710 return RegSucc;
6711}
6712
6713VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6714 VPRecipeBase *PredRecipe,
6715 VPlanPtr &Plan) {
6716 // Instructions marked for predication are replicated and placed under an
6717 // if-then construct to prevent side-effects.
6718
6719 // Generate recipes to compute the block mask for this region.
6720 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6721
6722 // Build the triangular if-then region.
6723 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6724 assert(Instr->getParent() && "Predicated instruction not in any basic block");
6725 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6726 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6727 auto *PHIRecipe =
6728 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6729 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6730 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6731 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6732
6733 // Note: first set Entry as region entry and then connect successors starting
6734 // from it in order, to propagate the "parent" of each VPBasicBlock.
6735 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6736 VPBlockUtils::connectBlocks(Pred, Exit);
6737
6738 return Region;
6739}
6740
6741bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6742 VPlanPtr &Plan, VPBasicBlock *VPBB) {
6743 VPRecipeBase *Recipe = nullptr;
6744 // Check if Instr should belong to an interleave memory recipe, or already
6745 // does. In the latter case Instr is irrelevant.
6746 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6747 VPBB->appendRecipe(Recipe);
6748 return true;
6749 }
6750
6751 // Check if Instr is a memory operation that should be widened.
6752 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6753 VPBB->appendRecipe(Recipe);
6754 return true;
6755 }
6756
6757 // Check if Instr should form some PHI recipe.
6758 if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6759 VPBB->appendRecipe(Recipe);
6760 return true;
6761 }
6762 if ((Recipe = tryToBlend(Instr, Plan))) {
6763 VPBB->appendRecipe(Recipe);
6764 return true;
6765 }
6766 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6767 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6768 return true;
6769 }
6770
6771 // Check if Instr is to be widened by a general VPWidenRecipe, after
6772 // having first checked for specific widening recipes that deal with
6773 // Interleave Groups, Inductions and Phi nodes.
6774 if (tryToWiden(Instr, VPBB, Range))
6775 return true;
6776
6777 return false;
6778}
6779
6780void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6781 unsigned MaxVF) {
6782 assert(OrigLoop->empty() && "Inner loop expected.");
6783
6784 // Collect conditions feeding internal conditional branches; they need to be
6785 // represented in VPlan for it to model masking.
6786 SmallPtrSet<Value *, 1> NeedDef;
6787
6788 auto *Latch = OrigLoop->getLoopLatch();
6789 for (BasicBlock *BB : OrigLoop->blocks()) {
6790 if (BB == Latch)
6791 continue;
6792 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6793 if (Branch && Branch->isConditional())
6794 NeedDef.insert(Branch->getCondition());
6795 }
6796
6797 // If the tail is to be folded by masking, the primary induction variable
6798 // needs to be represented in VPlan for it to model early-exit masking.
6799 if (CM.foldTailByMasking())
6800 NeedDef.insert(Legal->getPrimaryInduction());
6801
6802 // Collect instructions from the original loop that will become trivially dead
6803 // in the vectorized loop. We don't need to vectorize these instructions. For
6804 // example, original induction update instructions can become dead because we
6805 // separately emit induction "steps" when generating code for the new loop.
6806 // Similarly, we create a new latch condition when setting up the structure
6807 // of the new loop, so the old one can become dead.
6808 SmallPtrSet<Instruction *, 4> DeadInstructions;
6809 collectTriviallyDeadInstructions(DeadInstructions);
6810
6811 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6812 VFRange SubRange = {VF, MaxVF + 1};
6813 VPlans.push_back(
6814 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6815 VF = SubRange.End;
6816 }
6817}
6818
6819LoopVectorizationPlanner::VPlanPtr
6820LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6821 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6822 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6823 // Hold a mapping from predicated instructions to their recipes, in order to
6824 // fix their AlsoPack behavior if a user is determined to replicate and use a
6825 // scalar instead of vector value.
6826 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6827
6828 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6829 DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6830
6831 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6832 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6833 auto Plan = llvm::make_unique<VPlan>(VPBB);
6834
6835 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
6836 // Represent values that will have defs inside VPlan.
6837 for (Value *V : NeedDef)
6838 Plan->addVPValue(V);
6839
6840 // Scan the body of the loop in a topological order to visit each basic block
6841 // after having visited its predecessor basic blocks.
6842 LoopBlocksDFS DFS(OrigLoop);
6843 DFS.perform(LI);
6844
6845 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6846 // Relevant instructions from basic block BB will be grouped into VPRecipe
6847 // ingredients and fill a new VPBasicBlock.
6848 unsigned VPBBsForBB = 0;
6849 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6850 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6851 VPBB = FirstVPBBForBB;
6852 Builder.setInsertPoint(VPBB);
6853
6854 std::vector<Instruction *> Ingredients;
6855
6856 // Organize the ingredients to vectorize from current basic block in the
6857 // right order.
6858 for (Instruction &I : BB->instructionsWithoutDebug()) {
6859 Instruction *Instr = &I;
6860
6861 // First filter out irrelevant instructions, to ensure no recipes are
6862 // built for them.
6863 if (isa<BranchInst>(Instr) ||
6864 DeadInstructions.find(Instr) != DeadInstructions.end())
6865 continue;
6866
6867 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
6868 // member of the IG, do not construct any Recipe for it.
6869 const InterleaveGroup<Instruction> *IG =
6870 CM.getInterleavedAccessGroup(Instr);
6871 if (IG && Instr != IG->getInsertPos() &&
6872 Range.Start >= 2 && // Query is illegal for VF == 1
6873 CM.getWideningDecision(Instr, Range.Start) ==
6874 LoopVectorizationCostModel::CM_Interleave) {
6875 auto SinkCandidate = SinkAfterInverse.find(Instr);
6876 if (SinkCandidate != SinkAfterInverse.end())
6877 Ingredients.push_back(SinkCandidate->second);
6878 continue;
6879 }
6880
6881 // Move instructions to handle first-order recurrences, step 1: avoid
6882 // handling this instruction until after we've handled the instruction it
6883 // should follow.
6884 auto SAIt = SinkAfter.find(Instr);
6885 if (SAIt != SinkAfter.end()) {
6886 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
6887 << *SAIt->second
6888 << " to vectorize a 1st order recurrence.\n");
6889 SinkAfterInverse[SAIt->second] = Instr;
6890 continue;
6891 }
6892
6893 Ingredients.push_back(Instr);
6894
6895 // Move instructions to handle first-order recurrences, step 2: push the
6896 // instruction to be sunk at its insertion point.
6897 auto SAInvIt = SinkAfterInverse.find(Instr);
6898 if (SAInvIt != SinkAfterInverse.end())
6899 Ingredients.push_back(SAInvIt->second);
6900 }
6901
6902 // Introduce each ingredient into VPlan.
6903 for (Instruction *Instr : Ingredients) {
6904 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
6905 continue;
6906
6907 // Otherwise, if all widening options failed, Instruction is to be
6908 // replicated. This may create a successor for VPBB.
6909 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
6910 Instr, Range, VPBB, PredInst2Recipe, Plan);
6911 if (NextVPBB != VPBB) {
6912 VPBB = NextVPBB;
6913 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
6914 : "");
6915 }
6916 }
6917 }
6918
6919 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
6920 // may also be empty, such as the last one VPBB, reflecting original
6921 // basic-blocks with no recipes.
6922 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
6923 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
6924 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
6925 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
6926 delete PreEntry;
6927
6928 std::string PlanName;
6929 raw_string_ostream RSO(PlanName);
6930 unsigned VF = Range.Start;
6931 Plan->addVF(VF);
6932 RSO << "Initial VPlan for VF={" << VF;
6933 for (VF *= 2; VF < Range.End; VF *= 2) {
6934 Plan->addVF(VF);
6935 RSO << "," << VF;
6936 }
6937 RSO << "},UF>=1";
6938 RSO.flush();
6939 Plan->setName(PlanName);
6940
6941 return Plan;
6942}
6943
6944LoopVectorizationPlanner::VPlanPtr
6945LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
6946 // Outer loop handling: They may require CFG and instruction level
6947 // transformations before even evaluating whether vectorization is profitable.
6948 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6949 // the vectorization pipeline.
6950 assert(!OrigLoop->empty());
6951 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6952
6953 // Create new empty VPlan
6954 auto Plan = llvm::make_unique<VPlan>();
6955
6956 // Build hierarchical CFG
6957 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
6958 HCFGBuilder.buildHierarchicalCFG();
6959
6960 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
6961 Plan->addVF(VF);
6962
6963 if (EnableVPlanPredication) {
6964 VPlanPredicator VPP(*Plan);
6965 VPP.predicate();
6966
6967 // Avoid running transformation to recipes until masked code generation in
6968 // VPlan-native path is in place.
6969 return Plan;
6970 }
6971
6972 SmallPtrSet<Instruction *, 1> DeadInstructions;
6973 VPlanHCFGTransforms::VPInstructionsToVPRecipes(
6974 Plan, Legal->getInductionVars(), DeadInstructions);
6975
6976 return Plan;
6977}
6978
6979Value* LoopVectorizationPlanner::VPCallbackILV::
6980getOrCreateVectorValues(Value *V, unsigned Part) {
6981 return ILV.getOrCreateVectorValue(V, Part);
6982}
6983
6984void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
6985 O << " +\n"
6986 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
6987 IG->getInsertPos()->printAsOperand(O, false);
6988 if (User) {
6989 O << ", ";
6990 User->getOperand(0)->printAsOperand(O);
6991 }
6992 O << "\\l\"";
6993 for (unsigned i = 0; i < IG->getFactor(); ++i)
6994 if (Instruction *I = IG->getMember(i))
6995 O << " +\n"
6996 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
6997}
6998
6999void VPWidenRecipe::execute(VPTransformState &State) {
7000 for (auto &Instr : make_range(Begin, End))
7001 State.ILV->widenInstruction(Instr);
7002}
7003
7004void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7005 assert(!State.Instance && "Int or FP induction being replicated.");
7006 State.ILV->widenIntOrFpInduction(IV, Trunc);
7007}
7008
7009void VPWidenPHIRecipe::execute(VPTransformState &State) {
7010 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7011}
7012
7013void VPBlendRecipe::execute(VPTransformState &State) {
7014 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7015 // We know that all PHIs in non-header blocks are converted into
7016 // selects, so we don't have to worry about the insertion order and we
7017 // can just use the builder.
7018 // At this point we generate the predication tree. There may be
7019 // duplications since this is a simple recursive scan, but future
7020 // optimizations will clean it up.
7021
7022 unsigned NumIncoming = Phi->getNumIncomingValues();
7023
7024 assert((User || NumIncoming == 1) &&
7025 "Multiple predecessors with predecessors having a full mask");
7026 // Generate a sequence of selects of the form:
7027 // SELECT(Mask3, In3,
7028 // SELECT(Mask2, In2,
7029 // ( ...)))
7030 InnerLoopVectorizer::VectorParts Entry(State.UF);
7031 for (unsigned In = 0; In < NumIncoming; ++In) {
7032 for (unsigned Part = 0; Part < State.UF; ++Part) {
7033 // We might have single edge PHIs (blocks) - use an identity
7034 // 'select' for the first PHI operand.
7035 Value *In0 =
7036 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7037 if (In == 0)
7038 Entry[Part] = In0; // Initialize with the first incoming value.
7039 else {
7040 // Select between the current value and the previous incoming edge
7041 // based on the incoming mask.
7042 Value *Cond = State.get(User->getOperand(In), Part);
7043 Entry[Part] =
7044 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7045 }
7046 }
7047 }
7048 for (unsigned Part = 0; Part < State.UF; ++Part)
7049 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7050}
7051
7052void VPInterleaveRecipe::execute(VPTransformState &State) {
7053 assert(!State.Instance && "Interleave group being replicated.");
7054 if (!User)
7055 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7056
7057 // Last (and currently only) operand is a mask.
7058 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7059 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7060 for (unsigned Part = 0; Part < State.UF; ++Part)
7061 MaskValues[Part] = State.get(Mask, Part);
7062 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7063}
7064
7065void VPReplicateRecipe::execute(VPTransformState &State) {
7066 if (State.Instance) { // Generate a single instance.
7067 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7068 // Insert scalar instance packing it into a vector.
7069 if (AlsoPack && State.VF > 1) {
7070 // If we're constructing lane 0, initialize to start from undef.
7071 if (State.Instance->Lane == 0) {
7072 Value *Undef =
7073 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7074 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7075 }
7076 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7077 }
7078 return;
7079 }
7080
7081 // Generate scalar instances for all VF lanes of all UF parts, unless the
7082 // instruction is uniform, in which case generate only the first lane for each
7083 // of the UF parts.
7084 unsigned EndLane = IsUniform ? 1 : State.VF;
7085 for (unsigned Part = 0; Part < State.UF; ++Part)
7086 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7087 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7088}
7089
7090void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7091 assert(State.Instance && "Branch on Mask works only on single instance.");
7092
7093 unsigned Part = State.Instance->Part;
7094 unsigned Lane = State.Instance->Lane;
7095
7096 Value *ConditionBit = nullptr;
7097 if (!User) // Block in mask is all-one.
7098 ConditionBit = State.Builder.getTrue();
7099 else {
7100 VPValue *BlockInMask = User->getOperand(0);
7101 ConditionBit = State.get(BlockInMask, Part);
7102 if (ConditionBit->getType()->isVectorTy())
7103 ConditionBit = State.Builder.CreateExtractElement(
7104 ConditionBit, State.Builder.getInt32(Lane));
7105 }
7106
7107 // Replace the temporary unreachable terminator with a new conditional branch,
7108 // whose two destinations will be set later when they are created.
7109 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7110 assert(isa<UnreachableInst>(CurrentTerminator) &&
7111 "Expected to replace unreachable terminator with conditional branch.");
7112 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7113 CondBr->setSuccessor(0, nullptr);
7114 ReplaceInstWithInst(CurrentTerminator, CondBr);
7115}
7116
7117void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7118 assert(State.Instance && "Predicated instruction PHI works per instance.");
7119 Instruction *ScalarPredInst = cast<Instruction>(
7120 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7121 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7122 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7123 assert(PredicatingBB && "Predicated block has no single predecessor.");
7124
7125 // By current pack/unpack logic we need to generate only a single phi node: if
7126 // a vector value for the predicated instruction exists at this point it means
7127 // the instruction has vector users only, and a phi for the vector value is
7128 // needed. In this case the recipe of the predicated instruction is marked to
7129 // also do that packing, thereby "hoisting" the insert-element sequence.
7130 // Otherwise, a phi node for the scalar value is needed.
7131 unsigned Part = State.Instance->Part;
7132 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7133 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7134 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7135 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7136 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7137 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7138 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7139 } else {
7140 Type *PredInstType = PredInst->getType();
7141 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7142 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7143 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7144 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7145 }
7146}
7147
7148void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7149 if (!User)
7150 return State.ILV->vectorizeMemoryInstruction(&Instr);
7151
7152 // Last (and currently only) operand is a mask.
7153 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7154 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7155 for (unsigned Part = 0; Part < State.UF; ++Part)
7156 MaskValues[Part] = State.get(Mask, Part);
7157 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7158}
7159
7160// Process the loop in the VPlan-native vectorization path. This path builds
7161// VPlan upfront in the vectorization pipeline, which allows to apply
7162// VPlan-to-VPlan transformations from the very beginning without modifying the
7163// input LLVM IR.
7164static bool processLoopInVPlanNativePath(
7165 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7166 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7167 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7168 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7169 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7170
7171 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7172 Function *F = L->getHeader()->getParent();
7173 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7174 LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7175 &Hints, IAI);
7176 // Use the planner for outer loop vectorization.
7177 // TODO: CM is not used at this point inside the planner. Turn CM into an
7178 // optional argument if we don't need it in the future.
7179 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7180
7181 // Get user vectorization factor.
7182 const unsigned UserVF = Hints.getWidth();
7183
7184 // Check the function attributes and profiles to find out if this function
7185 // should be optimized for size.
7186 bool OptForSize =
7187 Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7188 (F->hasOptSize() ||
7189 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7190
7191 // Plan how to best vectorize, return the best VF and its cost.
7192 const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7193
7194 // If we are stress testing VPlan builds, do not attempt to generate vector
7195 // code. Masked vector code generation support will follow soon.
7196 // Also, do not attempt to vectorize if no vector code will be produced.
7197 if (VPlanBuildStressTest || EnableVPlanPredication ||
7198 VectorizationFactor::Disabled() == VF)
7199 return false;
7200
7201 LVP.setBestPlan(VF.Width, 1);
7202
7203 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7204 &CM);
7205 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7206 << L->getHeader()->getParent()->getName() << "\"\n");
7207 LVP.executePlan(LB, DT);
7208
7209 // Mark the loop as already vectorized to avoid vectorizing again.
7210 Hints.setAlreadyVectorized();
7211
7212 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7213 return true;
7214}
7215
7216bool LoopVectorizePass::processLoop(Loop *L) {
7217 assert((EnableVPlanNativePath || L->empty()) &&
1
Assuming the condition is false
2
Assuming the condition is true
3
'?' condition is true
7218 "VPlan-native path is not enabled. Only process inner loops.");
7219
7220#ifndef NDEBUG
7221 const std::string DebugLocStr = getDebugLocString(L);
7222#endif /* NDEBUG */
7223
7224 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
  4. Assuming 'DebugFlag' is 0
  5. Loop condition is false. Exiting loop
7225 << L->getHeader()->getParent()->getName() << "\" from "
7226 << DebugLocStr << "\n");
7227
7228 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7229
7230 LLVM_DEBUG(
  6. Assuming 'DebugFlag' is 0
  7. Loop condition is false. Exiting loop
7231 dbgs() << "LV: Loop hints:"
7232 << " force="
7233 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7234 ? "disabled"
7235 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7236 ? "enabled"
7237 : "?"))
7238 << " width=" << Hints.getWidth()
7239 << " unroll=" << Hints.getInterleave() << "\n");
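// Illustrative example (not part of this file): the force/width/unroll hints
// printed above typically originate from Clang loop pragmas in user code, e.g.
//
//   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
//   for (int i = 0; i < n; ++i)   // 'a', 'b', 'c', and 'n' are hypothetical
//     a[i] = b[i] + c[i];
//
// The pragma is lowered to llvm.loop metadata, which LoopVectorizeHints reads
// back as the values reported here.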
7240
7241 // Function containing loop
7242 Function *F = L->getHeader()->getParent();
7243
7244 // Looking at the diagnostic output is the only way to determine if a loop
7245 // was vectorized (other than looking at the IR or machine code), so it
7246 // is important to generate an optimization remark for each loop. Most of
7247 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7248 // generated as OptimizationRemark and OptimizationRemarkMissed are
7249 // less verbose reporting vectorized loops and unvectorized loops that may
7250 // benefit from vectorization, respectively.
7251
7252 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
  8. Assuming the condition is false
  9. Taking false branch
7253 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7254 return false;
7255 }
7256
7257 PredicatedScalarEvolution PSE(*SE, *L);
7258
7259 // Check if it is legal to vectorize the loop.
7260 LoopVectorizationRequirements Requirements(*ORE);
7261 LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
7262 &Requirements, &Hints, DB, AC);
7263 if (!LVL.canVectorize(EnableVPlanNativePath)) {
 10. Assuming the condition is false
 11. Taking false branch
7264 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7265 Hints.emitRemarkWithHints();
7266 return false;
7267 }
7268
7269 // Check the function attributes and profiles to find out if this function
7270 // should be optimized for size.
7271 bool OptForSize =
7272 Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
 12. Assuming the condition is false
7273 (F->hasOptSize() ||
7274 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
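// In other words: an explicit force hint (FK_Enabled) bypasses the size-based
// restriction; otherwise OptForSize follows the optsize attribute or the
// profile-driven size heuristic.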
7275
7276 // Entrance to the VPlan-native vectorization path. Outer loops are processed
7277 // here. They may require CFG and instruction level transformations before
7278 // even evaluating whether vectorization is profitable. Since we cannot modify
7279 // the incoming IR, we need to build VPlan upfront in the vectorization
7280 // pipeline.
7281 if (!L->empty())
 13. Assuming the condition is false
 14. Taking false branch
7282 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7283 ORE, BFI, PSI, Hints);
7284
7285 assert(L->empty() && "Inner loop expected.");
 15. Assuming the condition is true
 16. '?' condition is true
7286 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7287 // count by optimizing for size, to minimize overheads.
7288 // Prefer constant trip counts over profile data, over upper bound estimate.
7289 unsigned ExpectedTC = 0;
7290 bool HasExpectedTC = false;
7291 if (const SCEVConstant *ConstExits =
 17. Taking false branch
7292 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7293 const APInt &ExitsCount = ConstExits->getAPInt();
7294 // We are interested in small values for ExpectedTC. Skip over those that
7295 // can't fit an unsigned.
7296 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7297 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7298 HasExpectedTC = true;
7299 }
7300 }
7301 // ExpectedTC may be large because it's bound by a variable. Check
7302 // profiling information to validate we should vectorize.
7303 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
 18. Assuming the condition is false
 19. Taking false branch
7304 auto EstimatedTC = getLoopEstimatedTripCount(L);
7305 if (EstimatedTC) {
7306 ExpectedTC = *EstimatedTC;
7307 HasExpectedTC = true;
7308 }
7309 }
7310 if (!HasExpectedTC) {
 20. Taking true branch
7311 ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7312 HasExpectedTC = (ExpectedTC > 0);
 21. Assuming 'ExpectedTC' is <= 0
7313 }
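// Note: getSmallConstantMaxTripCount() returns 0 when no constant upper bound
// is known, so HasExpectedTC stays false in that case (as on the path analyzed
// above, where 'ExpectedTC' is assumed to be <= 0).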
7314
7315 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7316 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7317 << "This loop is worth vectorizing only if no scalar "
7318 << "iteration overheads are incurred.");
7319 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7320 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7321 else {
7322 LLVM_DEBUG(dbgs() << "\n");
7323 // Loops with a very small trip count are considered for vectorization
7324 // under OptForSize, thereby making sure the cost of their loop body is
7325 // dominant, free of runtime guards and scalar iteration overheads.
7326 OptForSize = true;
7327 }
7328 }
7329
7330 // Check the function attributes to see if implicit floats are allowed.
7331 // FIXME: This check doesn't seem possibly correct -- what if the loop is
7332 // an integer loop and the vector instructions selected are purely integer
7333 // vector instructions?
7334 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
 22. Assuming the condition is false
 23. Taking false branch
7335 LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
7336 " attribute is used.\n");
7337 ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7338 "NoImplicitFloat", L)
7339 << "loop not vectorized due to NoImplicitFloat attribute");
7340 Hints.emitRemarkWithHints();
7341 return false;
7342 }
7343
7344 // Check if the target supports potentially unsafe FP vectorization.
7345 // FIXME: Add a check for the type of safety issue (denormal, signaling)
7346 // for the target we're vectorizing for, to make sure none of the
7347 // additional fp-math flags can help.
7348 if (Hints.isPotentiallyUnsafe() &&
7349 TTI->isFPVectorizationPotentiallyUnsafe()) {
7350 LLVM_DEBUG(
7351 dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
7352 ORE->emit(
7353 createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
7354 << "loop not vectorized due to unsafe FP support.");
7355 Hints.emitRemarkWithHints();
7356 return false;
7357 }
7358
7359 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7360 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7361
7362 // If an override option has been passed in for interleaved accesses, use it.
7363 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
 24. Assuming the condition is false
 25. Taking false branch
7364 UseInterleaved = EnableInterleavedMemAccesses;
7365
7366 // Analyze interleaved memory accesses.
7367 if (UseInterleaved) {
 26. Assuming 'UseInterleaved' is 0
 27. Taking false branch
7368 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7369 }
7370
7371 // Use the cost model.
7372 LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7373 &Hints, IAI);
7374 CM.collectValuesToIgnore();
7375
7376 // Use the planner for vectorization.
7377 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7378
7379 // Get user vectorization factor.
7380 unsigned UserVF = Hints.getWidth();
7381
7382 // Plan how to best vectorize, return the best VF and its cost.
7383 Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
7384
7385 VectorizationFactor VF = VectorizationFactor::Disabled();
7386 unsigned IC = 1;
7387 unsigned UserIC = Hints.getInterleave();
7388
7389 if (MaybeVF) {
 28. Taking true branch
7390 VF = *MaybeVF;
7391 // Select the interleave count.
7392 IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
 29. Calling 'LoopVectorizationCostModel::selectInterleaveCount'
7393 }
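// Note: the analyzed path descends into selectInterleaveCount() here (step 29
// above); the division-by-zero this report flags is reached inside the cost
// model's interleave-count computation on that path.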
7394
7395 // Identify the diagnostic messages that should be produced.
7396 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7397 bool VectorizeLoop = true, InterleaveLoop = true;
7398 if (Requirements.doesNotMeet(F, L, Hints)) {
7399 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7400 "requirements.\n");
7401 Hints.emitRemarkWithHints();
7402 return false;
7403 }
7404
7405 if (VF.Width == 1) {
7406 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7407 VecDiagMsg = std::make_pair(
7408 "VectorizationNotBeneficial",
7409 "the cost-model indicates that vectorization is not beneficial");
7410 VectorizeLoop = false;
7411 }
7412
7413 if (!MaybeVF && UserIC > 1) {
7414 // Tell the user interleaving was avoided up-front, despite being explicitly
7415 // requested.
7416 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7417 "interleaving should be avoided up front\n");
7418 IntDiagMsg = std::make_pair(
7419 "InterleavingAvoided",
7420 "Ignoring UserIC, because interleaving was avoided up front");
7421 InterleaveLoop = false;
7422 } else if (IC == 1 && UserIC <= 1) {
7423 // Tell the user interleaving is not beneficial.
7424 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7425 IntDiagMsg = std::make_pair(
7426 "InterleavingNotBeneficial",
7427 "the cost-model indicates that interleaving is not beneficial");
7428 InterleaveLoop = false;
7429 if (UserIC == 1) {
7430 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7431 IntDiagMsg.second +=
7432 " and is explicitly disabled or interleave count is set to 1";
7433 }
7434 } else if (IC > 1 && UserIC == 1) {
7435 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7436 LLVM_DEBUG(
7437 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7438 IntDiagMsg = std::make_pair(
7439 "InterleavingBeneficialButDisabled",
7440 "the cost-model indicates that interleaving is beneficial "
7441 "but is explicitly disabled or interleave count is set to 1");
7442 InterleaveLoop = false;
7443 }
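// Decision matrix handled below: if neither vectorization nor interleaving is
// worthwhile, emit missed-optimization remarks and bail out; if exactly one is
// worthwhile, emit an analysis remark for the rejected transform and perform
// the other; if both are worthwhile, vectorize with the chosen VF and IC.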
7444
7445 // Override IC if user provided an interleave count.
7446 IC = UserIC > 0 ? UserIC : IC;
7447
7448 // Emit diagnostic messages, if any.
7449 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7450 if (!VectorizeLoop && !InterleaveLoop) {
7451 // Do not vectorize or interleave the loop.
7452 ORE->emit([&]() {
7453 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7454 L->getStartLoc(), L->getHeader())
7455 << VecDiagMsg.second;
7456 });
7457 ORE->emit([&]() {
7458 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7459 L->getStartLoc(), L->getHeader())
7460 << IntDiagMsg.second;
7461 });
7462 return false;
7463 } else if (!VectorizeLoop && InterleaveLoop) {
7464 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7465 ORE->emit([&]() {
7466 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7467 L->getStartLoc(), L->getHeader())
7468 << VecDiagMsg.second;
7469 });
7470 } else if (VectorizeLoop && !InterleaveLoop) {
7471 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7472 << ") in " << DebugLocStr << '\n');
7473 ORE->emit([&]() {
7474 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7475 L->getStartLoc(), L->getHeader())
7476 << IntDiagMsg.second;
7477 });
7478 } else if (VectorizeLoop && InterleaveLoop) {
7479 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7480 << ") in " << DebugLocStr << '\n');
7481 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7482 }
7483
7484 LVP.setBestPlan(VF.Width, IC);
7485
7486 using namespace ore;
7487 bool DisableRuntimeUnroll = false;
7488 MDNode *OrigLoopID = L->getLoopID();
7489
7490 if (!VectorizeLoop) {
7491 assert(IC > 1 && "interleave count should not be 1 or 0");
7492 // If we decided that it is not legal to vectorize the loop, then
7493 // interleave it.
7494 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7495 &CM);
7496 LVP.executePlan(Unroller, DT);
7497
7498 ORE->emit([&]() {
7499 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7500 L->getHeader())
7501 << "interleaved loop (interleaved count: "
7502 << NV("InterleaveCount", IC) << ")";
7503 });
7504 } else {
7505 // If we decided that it is *legal* to vectorize the loop, then do it.
7506 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7507 &LVL, &CM);
7508 LVP.executePlan(LB, DT);
7509 ++LoopsVectorized;
7510
7511 // Add metadata to disable runtime unrolling a scalar loop when there are
7512 // no runtime checks about strides and memory. A scalar loop that is
7513 // rarely used is not worth unrolling.
7514 if (!LB.areSafetyChecksAdded())
7515 DisableRuntimeUnroll = true;
7516
7517 // Report the vectorization decision.
7518 ORE->emit([&]() {
7519 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7520 L->getHeader())
7521 << "vectorized loop (vectorization width: "
7522 << NV("VectorizationFactor", VF.Width)
7523 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7524 });
7525 }
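// These remarks surface to users through the standard remark flags, e.g.
// -Rpass=loop-vectorize for the success messages above and
// -Rpass-missed=loop-vectorize / -Rpass-analysis=loop-vectorize for the
// missed/analysis diagnostics emitted earlier.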
7526
7527 Optional<MDNode *> RemainderLoopID =
7528 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7529 LLVMLoopVectorizeFollowupEpilogue});
7530 if (RemainderLoopID.hasValue()) {
7531 L->setLoopID(RemainderLoopID.getValue());
7532 } else {
7533 if (DisableRuntimeUnroll)
7534 AddRuntimeUnrollDisableMetaData(L);
7535
7536 // Mark the loop as already vectorized to avoid vectorizing again.
7537 Hints.setAlreadyVectorized();
7538 }
7539
7540 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7541 return true;
7542}
7543
7544bool LoopVectorizePass::runImpl(
7545 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7546 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7547 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7548 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7549 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7550 SE = &SE_;
7551 LI = &LI_;
7552 TTI = &TTI_;
7553 DT = &DT_;
7554 BFI = &BFI_;
7555 TLI = TLI_;
7556 AA = &AA_;
7557 AC = &AC_;
7558 GetLAA = &GetLAA_;
7559 DB = &DB_;
7560 ORE = &ORE_;
7561 PSI = PSI_;
7562
7563 // Don't attempt if
7564 // 1. the target claims to have no vector registers, and
7565 // 2. interleaving won't help ILP.
7566 //
7567 // The second condition is necessary because, even if the target has no
7568 // vector registers, loop vectorization may still enable scalar
7569 // interleaving.
7570 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7571 return false;
7572
7573 bool Changed = false;
7574
7575 // The vectorizer requires loops to be in simplified form.
7576 // Since simplification may add new inner loops, it has to run before the
7577 // legality and profitability checks. This means running the loop vectorizer
7578 // will simplify all loops, regardless of whether anything ends up being
7579 // vectorized.
7580 for (auto &L : *LI)
7581 Changed |=
7582 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
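// "Simplified form" is LoopSimplify canonical form: a preheader, a single
// backedge (one latch), and dedicated exit blocks.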
7583
7584 // Build up a worklist of inner-loops to vectorize. This is necessary as
7585 // the act of vectorizing or partially unrolling a loop creates new loops
7586 // and can invalidate iterators across the loops.
7587 SmallVector<Loop *, 8> Worklist;
7588
7589 for (Loop *L : *LI)
7590 collectSupportedLoops(*L, LI, ORE, Worklist);
7591
7592 LoopsAnalyzed += Worklist.size();
7593
7594 // Now walk the identified inner loops.
7595 while (!Worklist.empty()) {
7596 Loop *L = Worklist.pop_back_val();
7597
7598 // For the inner loops we actually process, form LCSSA to simplify the
7599 // transform.
7600 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7601
7602 Changed |= processLoop(L);
7603 }
7604
7605 // Process each loop nest in the function.
7606 return Changed;
7607}
7608
7609PreservedAnalyses LoopVectorizePass::run(Function &F,
7610 FunctionAnalysisManager &AM) {
7611 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7612 auto &LI = AM.getResult<LoopAnalysis>(F);
7613 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7614 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7615 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7616 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7617 auto &AA = AM.getResult<AAManager>(F);
7618 auto &AC = AM.getResult<AssumptionAnalysis>(F);
7619 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7620 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7621 MemorySSA *MSSA = EnableMSSALoopDependency
7622 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7623 : nullptr;
7624
7625 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7626 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7627 [&](Loop &L) -> const LoopAccessInfo & {
7628 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7629 return LAM.getResult<LoopAccessAnalysis>(L, AR);
7630 };
7631 const ModuleAnalysisManager &MAM =
7632 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7633 ProfileSummaryInfo *PSI =
7634 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7635 bool Changed =
7636 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7637 if (!Changed)
7638 return PreservedAnalyses::all();
7639 PreservedAnalyses PA;
7640
7641 // We currently do not preserve loopinfo/dominator analyses with outer loop
7642 // vectorization. Until this is addressed, mark these analyses as preserved
7643 // only for non-VPlan-native path.
7644 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7645 if (!EnableVPlanNativePath) {
7646 PA.preserve<LoopAnalysis>();
7647 PA.preserve<DominatorTreeAnalysis>();
7648 }
7649 PA.preserve<BasicAA>();
7650 PA.preserve<GlobalsAA>();
7651 return PA;
7652}