Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 6767, column 35
Potential leak of memory pointed to by 'BlockMask'
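
The flagged allocation site (line 6767) lies outside the excerpt reproduced below, but the class of defect this leak diagnostic reports can be pictured with the following minimal sketch. All names here are hypothetical and are not taken from LoopVectorize.cpp:

  // Hypothetical sketch of the reported pattern: a heap allocation whose only
  // handle goes out of scope on an early-return path before ownership is
  // handed off, so the analyzer considers the memory potentially leaked.
  struct VPValue { };

  VPValue *createMaskSketch(bool EdgeIsTrivial) {
    VPValue *BlockMask = new VPValue();  // allocation tracked by the analyzer
    if (EdgeIsTrivial)
      return nullptr;                    // BlockMask unreachable here: potential leak
    return BlockMask;                    // otherwise ownership passes to the caller
  }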

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
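//
// As a rough illustration of the widening described above (a plain C++ sketch
// with a fixed width of 4, not code from this pass): the scalar loop
//   for (int i = 0; i < n; ++i) a[i] = b[i] + c[i];
// is conceptually rewritten so the index advances by the vector width, with a
// scalar epilogue loop for the remaining iterations:

void addWidenedSketch(float *a, const float *b, const float *c, int n) {
  int i = 0;
  for (; i + 4 <= n; i += 4)             // one "wide" iteration = 4 elements
    for (int lane = 0; lane < 4; ++lane)
      a[i + lane] = b[i + lane] + c[i + lane];
  for (; i < n; ++i)                     // scalar epilogue for the remainder
    a[i] = b[i] + c[i];
}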
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SetVector.h"
73#include "llvm/ADT/SmallPtrSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpander.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/VectorUtils.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfoMetadata.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/DiagnosticInfo.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
115#include "llvm/IR/IntrinsicInst.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/LLVMContext.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/InitializePasses.h"
128#include "llvm/Pass.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/InjectTLIMappings.h"
138#include "llvm/Transforms/Utils/LoopSimplify.h"
139#include "llvm/Transforms/Utils/LoopUtils.h"
140#include "llvm/Transforms/Utils/LoopVersioning.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME "loop-vectorize"
158#define DEBUG_TYPE LV_NAME
159
160/// @{
161/// Metadata attribute names
162static const char *const LLVMLoopVectorizeFollowupAll =
163 "llvm.loop.vectorize.followup_all";
164static const char *const LLVMLoopVectorizeFollowupVectorized =
165 "llvm.loop.vectorize.followup_vectorized";
166static const char *const LLVMLoopVectorizeFollowupEpilogue =
167 "llvm.loop.vectorize.followup_epilogue";
168/// @}
169
170STATISTIC(LoopsVectorized, "Number of loops vectorized");
171STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172
173/// Loops with a known constant trip count below this number are vectorized only
174/// if no scalar iteration overheads are incurred.
175static cl::opt<unsigned> TinyTripCountVectorThreshold(
176 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177 cl::desc("Loops with a constant trip count that is smaller than this "
178 "value are vectorized only if no scalar iteration overheads "
179 "are incurred."));
180
181// Indicates that an epilogue is undesired, predication is preferred.
182// This means that the vectorizer will try to fold the loop-tail (epilogue)
183// into the loop and predicate the loop body accordingly.
184static cl::opt<bool> PreferPredicateOverEpilog(
185 "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186 cl::desc("Indicate that an epilogue is undesired, predication should be "
187 "used instead."));
188
189static cl::opt<bool> MaximizeBandwidth(
190 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191 cl::desc("Maximize bandwidth when selecting vectorization factor which "
192 "will be determined by the smallest type in loop."));
193
194static cl::opt<bool> EnableInterleavedMemAccesses(
195 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197
198/// An interleave-group may need masking if it resides in a block that needs
199/// predication, or in order to mask away gaps.
200static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203
204static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206 cl::desc("We don't interleave loops with a estimated constant trip count "
207 "below this number"));
208
209static cl::opt<unsigned> ForceTargetNumScalarRegs(
210 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211 cl::desc("A flag that overrides the target's number of scalar registers."));
212
213static cl::opt<unsigned> ForceTargetNumVectorRegs(
214 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215 cl::desc("A flag that overrides the target's number of vector registers."));
216
217static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219 cl::desc("A flag that overrides the target's max interleave factor for "
220 "scalar loops."));
221
222static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224 cl::desc("A flag that overrides the target's max interleave factor for "
225 "vectorized loops."));
226
227static cl::opt<unsigned> ForceTargetInstructionCost(
228 "force-target-instruction-cost", cl::init(0), cl::Hidden,
229 cl::desc("A flag that overrides the target's expected cost for "
230 "an instruction to a single constant value. Mostly "
231 "useful for getting consistent testing."));
232
233static cl::opt<unsigned> SmallLoopCost(
234 "small-loop-cost", cl::init(20), cl::Hidden,
235 cl::desc(
236 "The cost of a loop that is considered 'small' by the interleaver."));
237
238static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240 cl::desc("Enable the use of the block frequency analysis to access PGO "
241 "heuristics minimizing code growth in cold regions and being more "
242 "aggressive in hot regions."));
243
244// Runtime interleave loops for load/store throughput.
245static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247 cl::desc(
248 "Enable runtime interleaving until load/store ports are saturated"));
249
250/// The number of stores in a loop that are allowed to need predication.
251static cl::opt<unsigned> NumberOfStoresToPredicate(
252 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253 cl::desc("Max number of stores to be predicated behind an if."));
254
255static cl::opt<bool> EnableIndVarRegisterHeur(
256 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257 cl::desc("Count the induction variable only once when interleaving"));
258
259static cl::opt<bool> EnableCondStoresVectorization(
260 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261 cl::desc("Enable if predication of stores during vectorization."));
262
263static cl::opt<unsigned> MaxNestedScalarReductionIC(
264 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265 cl::desc("The maximum interleave count to use when interleaving a scalar "
266 "reduction in a nested loop."));
267
268cl::opt<bool> EnableVPlanNativePath(
269 "enable-vplan-native-path", cl::init(false), cl::Hidden,
270 cl::desc("Enable VPlan-native vectorization path with "
271 "support for outer loop vectorization."));
272
273// FIXME: Remove this switch once we have divergence analysis. Currently we
274// assume divergent non-backedge branches when this switch is true.
275cl::opt<bool> EnableVPlanPredication(
276 "enable-vplan-predication", cl::init(false), cl::Hidden,
277 cl::desc("Enable VPlan-native vectorization path predicator with "
278 "support for outer loop vectorization."));
279
280// This flag enables the stress testing of the VPlan H-CFG construction in the
281// VPlan-native vectorization path. It must be used in conjunction with
282// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283// verification of the H-CFGs built.
284static cl::opt<bool> VPlanBuildStressTest(
285 "vplan-build-stress-test", cl::init(false), cl::Hidden,
286 cl::desc(
287 "Build VPlan for every supported loop nest in the function and bail "
288 "out right after the build (stress test the VPlan H-CFG construction "
289 "in the VPlan-native vectorization path)."));
290
291cl::opt<bool> llvm::EnableLoopInterleaving(
292 "interleave-loops", cl::init(true), cl::Hidden,
293 cl::desc("Enable loop interleaving in Loop vectorization passes"));
294cl::opt<bool> llvm::EnableLoopVectorization(
295 "vectorize-loops", cl::init(true), cl::Hidden,
296 cl::desc("Run the Loop vectorization passes"));
297
298/// A helper function that returns the type of loaded or stored value.
299static Type *getMemInstValueType(Value *I) {
300 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301        "Expected Load or Store instruction");
302 if (auto *LI = dyn_cast<LoadInst>(I))
303 return LI->getType();
304 return cast<StoreInst>(I)->getValueOperand()->getType();
305}
306
307/// A helper function that returns true if the given type is irregular. The
308/// type is irregular if its allocated size doesn't equal the store size of an
309/// element of the corresponding vector type at the given vectorization factor.
310static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311 // Determine if an array of VF elements of type Ty is "bitcast compatible"
312 // with a <VF x Ty> vector.
313 if (VF > 1) {
314 auto *VectorTy = VectorType::get(Ty, VF);
315 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316 }
317
318 // If the vectorization factor is one, we just check if an array of type Ty
319 // requires padding between elements.
320 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321}
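
// For instance (an illustrative sketch against the standard x86-64 data
// layout, not code from this file): x86_fp80 stores 10 bytes but is allocated
// in 16-byte slots, so an array of 4 such elements does not have the same
// layout as a <4 x x86_fp80> vector, making the type "irregular" in the sense
// of hasIrregularType above.
static bool exampleIrregularTypeSketch(LLVMContext &Ctx) {
  DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  Type *Ty = Type::getX86_FP80Ty(Ctx);
  auto *VecTy = VectorType::get(Ty, 4);
  // 4 * 16 allocated bytes != 40 stored bytes, so this returns true.
  return 4 * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VecTy);
}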
322
323/// A helper function that returns the reciprocal of the block probability of
324/// predicated blocks. If we return X, we are assuming the predicated block
325/// will execute once for every X iterations of the loop header.
326///
327/// TODO: We should use actual block probability here, if available. Currently,
328/// we always assume predicated blocks have a 50% chance of executing.
329static unsigned getReciprocalPredBlockProb() { return 2; }
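
// In other words, a predicated block is assumed to run on half of the header's
// iterations, so its cost is discounted accordingly. A sketch of that use
// (illustrative, not a quote of the cost model later in this file):
static unsigned discountPredicatedBlockCostSketch(unsigned BlockCost) {
  // Scale by the assumed execution probability, 1 / getReciprocalPredBlockProb().
  return BlockCost / getReciprocalPredBlockProb();
}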
330
331/// A helper function that adds a 'fast' flag to floating-point operations.
332static Value *addFastMathFlag(Value *V) {
333 if (isa<FPMathOperator>(V))
334 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335 return V;
336}
337
338static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339 if (isa<FPMathOperator>(V))
340 cast<Instruction>(V)->setFastMathFlags(FMF);
341 return V;
342}
343
344/// A helper function that returns an integer or floating-point constant with
345/// value C.
346static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348 : ConstantFP::get(Ty, C);
349}
350
351/// Returns "best known" trip count for the specified loop \p L as defined by
352/// the following procedure:
353/// 1) Returns exact trip count if it is known.
354/// 2) Returns expected trip count according to profile data if any.
355/// 3) Returns upper bound estimate if it is known.
356/// 4) Returns None if all of the above failed.
357static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358 // Check if exact trip count is known.
359 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360 return ExpectedTC;
361
362 // Check if there is an expected trip count available from profile data.
363 if (LoopVectorizeWithBlockFrequency)
364 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365 return EstimatedTC;
366
367 // Check if upper bound estimate is known.
368 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369 return ExpectedTC;
370
371 return None;
372}
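
// A typical use of this helper (an illustrative sketch, not a quote of the
// callers later in this file) is to compare the best known trip count against
// the tiny-trip-count threshold before committing to vectorization:
static bool isLowTripCountLoopSketch(ScalarEvolution &SE, Loop *L) {
  if (Optional<unsigned> TC = getSmallBestKnownTC(SE, L))
    return *TC < TinyTripCountVectorThreshold;
  return false; // Unknown trip count: not treated as "tiny" here.
}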
373
374namespace llvm {
375
376/// InnerLoopVectorizer vectorizes loops which contain only one basic
377/// block to a specified vectorization factor (VF).
378/// This class performs the widening of scalars into vectors, or multiple
379/// scalars. This class also implements the following features:
380/// * It inserts an epilogue loop for handling loops that don't have iteration
381/// counts that are known to be a multiple of the vectorization factor.
382/// * It handles the code generation for reduction variables.
383/// * Scalarization (implementation using scalars) of un-vectorizable
384/// instructions.
385/// InnerLoopVectorizer does not perform any vectorization-legality
386/// checks, and relies on the caller to check for the different legality
387/// aspects. The InnerLoopVectorizer relies on the
388/// LoopVectorizationLegality class to provide information about the induction
389/// and reduction variables that were found to a given vectorization factor.
390class InnerLoopVectorizer {
391public:
392 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393 LoopInfo *LI, DominatorTree *DT,
394 const TargetLibraryInfo *TLI,
395 const TargetTransformInfo *TTI, AssumptionCache *AC,
396 OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398 LoopVectorizationCostModel *CM)
399 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401 Builder(PSE.getSE()->getContext()),
402 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403 virtual ~InnerLoopVectorizer() = default;
404
405 /// Create a new empty loop. Unlink the old loop and connect the new one.
406 /// Return the pre-header block of the new loop.
407 BasicBlock *createVectorizedLoopSkeleton();
408
409 /// Widen a single instruction within the innermost loop.
410 void widenInstruction(Instruction &I);
411
412 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
413 void fixVectorizedLoop();
414
415 // Return true if any runtime check is added.
416 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
417
418 /// A type for vectorized values in the new loop. Each value from the
419 /// original loop, when vectorized, is represented by UF vector values in the
420 /// new unrolled loop, where UF is the unroll factor.
421 using VectorParts = SmallVector<Value *, 2>;
422
423 /// Vectorize a single GetElementPtrInst based on information gathered and
424 /// decisions taken during planning.
425 void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
426 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
427
428 /// Vectorize a single PHINode in a block. This method handles the induction
429 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
430 /// arbitrary length vectors.
431 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
432
433 /// A helper function to scalarize a single Instruction in the innermost loop.
434 /// Generates a sequence of scalar instances for each lane between \p MinLane
435 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
436 /// inclusive.
437 void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
438 bool IfPredicateInstr);
439
440 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
441 /// is provided, the integer induction variable will first be truncated to
442 /// the corresponding type.
443 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
444
445 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
446 /// vector or scalar value on-demand if one is not yet available. When
447 /// vectorizing a loop, we visit the definition of an instruction before its
448 /// uses. When visiting the definition, we either vectorize or scalarize the
449 /// instruction, creating an entry for it in the corresponding map. (In some
450 /// cases, such as induction variables, we will create both vector and scalar
451 /// entries.) Then, as we encounter uses of the definition, we derive values
452 /// for each scalar or vector use unless such a value is already available.
453 /// For example, if we scalarize a definition and one of its uses is vector,
454 /// we build the required vector on-demand with an insertelement sequence
455 /// when visiting the use. Otherwise, if the use is scalar, we can use the
456 /// existing scalar definition.
457 ///
458 /// Return a value in the new loop corresponding to \p V from the original
459 /// loop at unroll index \p Part. If the value has already been vectorized,
460 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
461 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
462 /// a new vector value on-demand by inserting the scalar values into a vector
463 /// with an insertelement sequence. If the value has been neither vectorized
464 /// nor scalarized, it must be loop invariant, so we simply broadcast the
465 /// value into a vector.
466 Value *getOrCreateVectorValue(Value *V, unsigned Part);
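
 // As a sketch of the on-demand packing described above (illustrative only,
 // not the body of any method in this class): a scalarized definition is
 // rebuilt as a vector with an insertelement sequence, one lane at a time:
 //   Value *Vec = UndefValue::get(VectorType::get(ScalarTy, VF));
 //   for (unsigned Lane = 0; Lane < VF; ++Lane)
 //     Vec = Builder.CreateInsertElement(Vec, ScalarLane[Lane],
 //                                       Builder.getInt32(Lane));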
467
468 /// Return a value in the new loop corresponding to \p V from the original
469 /// loop at unroll and vector indices \p Instance. If the value has been
470 /// vectorized but not scalarized, the necessary extractelement instruction
471 /// will be generated.
472 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
473
474 /// Construct the vector value of a scalarized value \p V one lane at a time.
475 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
476
477 /// Try to vectorize the interleaved access group that \p Instr belongs to
478 /// with the base address given in \p Addr, optionally masking the vector
479 /// operations if \p BlockInMask is non-null. Use \p State to translate given
480 /// VPValues to IR values in the vectorized loop.
481 void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
482 VPValue *Addr, VPValue *BlockInMask = nullptr);
483
484 /// Vectorize Load and Store instructions with the base address given in \p
485 /// Addr, optionally masking the vector operations if \p BlockInMask is
486 /// non-null. Use \p State to translate given VPValues to IR values in the
487 /// vectorized loop.
488 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
489 VPValue *Addr,
490 VPValue *BlockInMask = nullptr);
491
492 /// Set the debug location in the builder using the debug location in
493 /// the instruction.
494 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
495
496 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
497 void fixNonInductionPHIs(void);
498
499protected:
500 friend class LoopVectorizationPlanner;
501
502 /// A small list of PHINodes.
503 using PhiVector = SmallVector<PHINode *, 4>;
504
505 /// A type for scalarized values in the new loop. Each value from the
506 /// original loop, when scalarized, is represented by UF x VF scalar values
507 /// in the new unrolled loop, where UF is the unroll factor and VF is the
508 /// vectorization factor.
509 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
510
511 /// Set up the values of the IVs correctly when exiting the vector loop.
512 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
513 Value *CountRoundDown, Value *EndValue,
514 BasicBlock *MiddleBlock);
515
516 /// Create a new induction variable inside L.
517 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
518 Value *Step, Instruction *DL);
519
520 /// Handle all cross-iteration phis in the header.
521 void fixCrossIterationPHIs();
522
523 /// Fix a first-order recurrence. This is the second phase of vectorizing
524 /// this phi node.
525 void fixFirstOrderRecurrence(PHINode *Phi);
526
527 /// Fix a reduction cross-iteration phi. This is the second phase of
528 /// vectorizing this phi node.
529 void fixReduction(PHINode *Phi);
530
531 /// Clear NSW/NUW flags from reduction instructions if necessary.
532 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
533
534 /// The Loop exit block may have single value PHI nodes with some
535 /// incoming value. While vectorizing we only handled real values
536 /// that were defined inside the loop and we should have one value for
537 /// each predecessor of its parent basic block. See PR14725.
538 void fixLCSSAPHIs();
539
540 /// Iteratively sink the scalarized operands of a predicated instruction into
541 /// the block that was created for it.
542 void sinkScalarOperands(Instruction *PredInst);
543
544 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545 /// represented as.
546 void truncateToMinimalBitwidths();
547
548 /// Create a broadcast instruction. This method generates a broadcast
549 /// instruction (shuffle) for loop invariant values and for the induction
550 /// value. If this is the induction variable then we extend it to N, N+1, ...
551 /// this is needed because each iteration in the loop corresponds to a SIMD
552 /// element.
553 virtual Value *getBroadcastInstrs(Value *V);
554
555 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
556 /// to each vector element of Val. The sequence starts at StartIndex.
557 /// \p Opcode is relevant for FP induction variable.
558 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
559 Instruction::BinaryOps Opcode =
560 Instruction::BinaryOpsEnd);
561
562 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
563 /// variable on which to base the steps, \p Step is the size of the step, and
564 /// \p EntryVal is the value from the original loop that maps to the steps.
565 /// Note that \p EntryVal doesn't have to be an induction variable - it
566 /// can also be a truncate instruction.
567 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
568 const InductionDescriptor &ID);
569
570 /// Create a vector induction phi node based on an existing scalar one. \p
571 /// EntryVal is the value from the original loop that maps to the vector phi
572 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
573 /// truncate instruction, instead of widening the original IV, we widen a
574 /// version of the IV truncated to \p EntryVal's type.
575 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
576 Value *Step, Instruction *EntryVal);
577
578 /// Returns true if an instruction \p I should be scalarized instead of
579 /// vectorized for the chosen vectorization factor.
580 bool shouldScalarizeInstruction(Instruction *I) const;
581
582 /// Returns true if we should generate a scalar version of \p IV.
583 bool needsScalarInduction(Instruction *IV) const;
584
585 /// If there is a cast involved in the induction variable \p ID, which should
586 /// be ignored in the vectorized loop body, this function records the
587 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
588 /// cast. We had already proved that the casted Phi is equal to the uncasted
589 /// Phi in the vectorized loop (under a runtime guard), and therefore
590 /// there is no need to vectorize the cast - the same value can be used in the
591 /// vector loop for both the Phi and the cast.
592 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
593 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
594 ///
595 /// \p EntryVal is the value from the original loop that maps to the vector
596 /// phi node and is used to distinguish what is the IV currently being
597 /// processed - original one (if \p EntryVal is a phi corresponding to the
598 /// original IV) or the "newly-created" one based on the proof mentioned above
599 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
600 /// latter case \p EntryVal is a TruncInst and we must not record anything for
601 /// that IV, but it's error-prone to expect callers of this routine to care
602 /// about that, hence this explicit parameter.
603 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
604 const Instruction *EntryVal,
605 Value *VectorLoopValue,
606 unsigned Part,
607                                              unsigned Lane = UINT_MAX);
608
609 /// Generate a shuffle sequence that will reverse the vector Vec.
610 virtual Value *reverseVector(Value *Vec);
611
612 /// Returns (and creates if needed) the original loop trip count.
613 Value *getOrCreateTripCount(Loop *NewLoop);
614
615 /// Returns (and creates if needed) the trip count of the widened loop.
616 Value *getOrCreateVectorTripCount(Loop *NewLoop);
617
618 /// Returns a bitcasted value to the requested vector type.
619 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
620 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
621 const DataLayout &DL);
622
623 /// Emit a bypass check to see if the vector trip count is zero, including if
624 /// it overflows.
625 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
626
627 /// Emit a bypass check to see if all of the SCEV assumptions we've
628 /// had to make are correct.
629 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
630
631 /// Emit bypass checks to check any memory assumptions we may have made.
632 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
633
634 /// Compute the transformed value of Index at offset StartValue using step
635 /// StepValue.
636 /// For integer induction, returns StartValue + Index * StepValue.
637 /// For pointer induction, returns StartValue[Index * StepValue].
638 /// FIXME: The newly created binary instructions should contain nsw/nuw
639 /// flags, which can be found from the original scalar operations.
640 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
641 const DataLayout &DL,
642 const InductionDescriptor &ID) const;
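
 // A concrete instance of the mapping above (illustrative numbers only): with
 // StartValue = 8, StepValue = 4 and Index = 3, an integer induction yields
 // 8 + 3 * 4 = 20, while a pointer induction yields &StartValue[3 * 4], i.e.
 // the element 12 positions past StartValue.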
643
644 /// Add additional metadata to \p To that was not present on \p Orig.
645 ///
646 /// Currently this is used to add the noalias annotations based on the
647 /// inserted memchecks. Use this for instructions that are *cloned* into the
648 /// vector loop.
649 void addNewMetadata(Instruction *To, const Instruction *Orig);
650
651 /// Add metadata from one instruction to another.
652 ///
653 /// This includes both the original MDs from \p From and additional ones (\see
654 /// addNewMetadata). Use this for *newly created* instructions in the vector
655 /// loop.
656 void addMetadata(Instruction *To, Instruction *From);
657
658 /// Similar to the previous function but it adds the metadata to a
659 /// vector of instructions.
660 void addMetadata(ArrayRef<Value *> To, Instruction *From);
661
662 /// The original loop.
663 Loop *OrigLoop;
664
665 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
666 /// dynamic knowledge to simplify SCEV expressions and converts them to a
667 /// more usable form.
668 PredicatedScalarEvolution &PSE;
669
670 /// Loop Info.
671 LoopInfo *LI;
672
673 /// Dominator Tree.
674 DominatorTree *DT;
675
676 /// Alias Analysis.
677 AliasAnalysis *AA;
678
679 /// Target Library Info.
680 const TargetLibraryInfo *TLI;
681
682 /// Target Transform Info.
683 const TargetTransformInfo *TTI;
684
685 /// Assumption Cache.
686 AssumptionCache *AC;
687
688 /// Interface to emit optimization remarks.
689 OptimizationRemarkEmitter *ORE;
690
691 /// LoopVersioning. It's only set up (non-null) if memchecks were
692 /// used.
693 ///
694 /// This is currently only used to add no-alias metadata based on the
695 /// memchecks. The actually versioning is performed manually.
696 std::unique_ptr<LoopVersioning> LVer;
697
698 /// The vectorization SIMD factor to use. Each vector will have this many
699 /// vector elements.
700 unsigned VF;
701
702 /// The vectorization unroll factor to use. Each scalar is vectorized to this
703 /// many different vector instructions.
704 unsigned UF;
705
706 /// The builder that we use
707 IRBuilder<> Builder;
708
709 // --- Vectorization state ---
710
711 /// The vector-loop preheader.
712 BasicBlock *LoopVectorPreHeader;
713
714 /// The scalar-loop preheader.
715 BasicBlock *LoopScalarPreHeader;
716
717 /// Middle Block between the vector and the scalar.
718 BasicBlock *LoopMiddleBlock;
719
720 /// The ExitBlock of the scalar loop.
721 BasicBlock *LoopExitBlock;
722
723 /// The vector loop body.
724 BasicBlock *LoopVectorBody;
725
726 /// The scalar loop body.
727 BasicBlock *LoopScalarBody;
728
729 /// A list of all bypass blocks. The first block is the entry of the loop.
730 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
731
732 /// The new Induction variable which was added to the new block.
733 PHINode *Induction = nullptr;
734
735 /// The induction variable of the old basic block.
736 PHINode *OldInduction = nullptr;
737
738 /// Maps values from the original loop to their corresponding values in the
739 /// vectorized loop. A key value can map to either vector values, scalar
740 /// values or both kinds of values, depending on whether the key was
741 /// vectorized and scalarized.
742 VectorizerValueMap VectorLoopValueMap;
743
744 /// Store instructions that were predicated.
745 SmallVector<Instruction *, 4> PredicatedInstructions;
746
747 /// Trip count of the original loop.
748 Value *TripCount = nullptr;
749
750 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
751 Value *VectorTripCount = nullptr;
752
753 /// The legality analysis.
754 LoopVectorizationLegality *Legal;
755
756 /// The profitability analysis.
757 LoopVectorizationCostModel *Cost;
758
759 // Record whether runtime checks are added.
760 bool AddedSafetyChecks = false;
761
762 // Holds the end values for each induction variable. We save the end values
763 // so we can later fix-up the external users of the induction variables.
764 DenseMap<PHINode *, Value *> IVEndValues;
765
766 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767 // fixed up at the end of vector code generation.
768 SmallVector<PHINode *, 8> OrigPHIsToFix;
769};
770
771class InnerLoopUnroller : public InnerLoopVectorizer {
772public:
773 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
774 LoopInfo *LI, DominatorTree *DT,
775 const TargetLibraryInfo *TLI,
776 const TargetTransformInfo *TTI, AssumptionCache *AC,
777 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778 LoopVectorizationLegality *LVL,
779 LoopVectorizationCostModel *CM)
780 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
781 UnrollFactor, LVL, CM) {}
782
783private:
784 Value *getBroadcastInstrs(Value *V) override;
785 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
786 Instruction::BinaryOps Opcode =
787 Instruction::BinaryOpsEnd) override;
788 Value *reverseVector(Value *Vec) override;
789};
790
791} // end namespace llvm
792
793/// Look for a meaningful debug location on the instruction or its
794/// operands.
795static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
796 if (!I)
797 return I;
798
799 DebugLoc Empty;
800 if (I->getDebugLoc() != Empty)
801 return I;
802
803 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
804 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
805 if (OpInst->getDebugLoc() != Empty)
806 return OpInst;
807 }
808
809 return I;
810}
811
812void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
813 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
814 const DILocation *DIL = Inst->getDebugLoc();
815 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
816 !isa<DbgInfoIntrinsic>(Inst)) {
817 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
818 if (NewDIL)
819 B.SetCurrentDebugLocation(NewDIL.getValue());
820 else
821        LLVM_DEBUG(dbgs()
822                   << "Failed to create new discriminator: "
823                   << DIL->getFilename() << " Line: " << DIL->getLine());
824 }
825 else
826 B.SetCurrentDebugLocation(DIL);
827 } else
828 B.SetCurrentDebugLocation(DebugLoc());
829}
830
831/// Write a record \p DebugMsg about vectorization failure to the debug
832/// output stream. If \p I is passed, it is an instruction that prevents
833/// vectorization.
834#ifndef NDEBUG
835static void debugVectorizationFailure(const StringRef DebugMsg,
836 Instruction *I) {
837 dbgs() << "LV: Not vectorizing: " << DebugMsg;
838 if (I != nullptr)
839 dbgs() << " " << *I;
840 else
841 dbgs() << '.';
842 dbgs() << '\n';
843}
844#endif
845
846/// Create an analysis remark that explains why vectorization failed
847///
848/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
849/// RemarkName is the identifier for the remark. If \p I is passed it is an
850/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
851/// the location of the remark. \return the remark object that can be
852/// streamed to.
853static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
854 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
855 Value *CodeRegion = TheLoop->getHeader();
856 DebugLoc DL = TheLoop->getStartLoc();
857
858 if (I) {
859 CodeRegion = I->getParent();
860 // If there is no debug location attached to the instruction, revert back to
861 // using the loop's.
862 if (I->getDebugLoc())
863 DL = I->getDebugLoc();
864 }
865
866 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
867 R << "loop not vectorized: ";
868 return R;
869}
870
871namespace llvm {
872
873void reportVectorizationFailure(const StringRef DebugMsg,
874 const StringRef OREMsg, const StringRef ORETag,
875 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
876 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
877 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
878 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
879 ORETag, TheLoop, I) << OREMsg);
880}
881
882} // end namespace llvm
883
884#ifndef NDEBUG
885/// \return string containing a file name and a line # for the given loop.
886static std::string getDebugLocString(const Loop *L) {
887 std::string Result;
888 if (L) {
889 raw_string_ostream OS(Result);
890 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
891 LoopDbgLoc.print(OS);
892 else
893 // Just print the module name.
894 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
895 OS.flush();
896 }
897 return Result;
898}
899#endif
900
901void InnerLoopVectorizer::addNewMetadata(Instruction *To,
902 const Instruction *Orig) {
903 // If the loop was versioned with memchecks, add the corresponding no-alias
904 // metadata.
905 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
906 LVer->annotateInstWithNoAlias(To, Orig);
907}
908
909void InnerLoopVectorizer::addMetadata(Instruction *To,
910 Instruction *From) {
911 propagateMetadata(To, From);
912 addNewMetadata(To, From);
913}
914
915void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
916 Instruction *From) {
917 for (Value *V : To) {
918 if (Instruction *I = dyn_cast<Instruction>(V))
919 addMetadata(I, From);
920 }
921}
922
923namespace llvm {
924
925// Loop vectorization cost-model hints how the scalar epilogue loop should be
926// lowered.
927enum ScalarEpilogueLowering {
928
929 // The default: allowing scalar epilogues.
930 CM_ScalarEpilogueAllowed,
931
932 // Vectorization with OptForSize: don't allow epilogues.
933 CM_ScalarEpilogueNotAllowedOptSize,
934
935 // A special case of vectorisation with OptForSize: loops with a very small
936 // trip count are considered for vectorization under OptForSize, thereby
937 // making sure the cost of their loop body is dominant, free of runtime
938 // guards and scalar iteration overheads.
939 CM_ScalarEpilogueNotAllowedLowTripLoop,
940
941 // Loop hint predicate indicating an epilogue is undesired.
942 CM_ScalarEpilogueNotNeededUsePredicate
943};
944
945/// LoopVectorizationCostModel - estimates the expected speedups due to
946/// vectorization.
947/// In many cases vectorization is not profitable. This can happen because of
948/// a number of reasons. In this class we mainly attempt to predict the
949/// expected speedup/slowdowns due to the supported instruction set. We use the
950/// TargetTransformInfo to query the different backends for the cost of
951/// different operations.
952class LoopVectorizationCostModel {
953public:
954 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
955 PredicatedScalarEvolution &PSE, LoopInfo *LI,
956 LoopVectorizationLegality *Legal,
957 const TargetTransformInfo &TTI,
958 const TargetLibraryInfo *TLI, DemandedBits *DB,
959 AssumptionCache *AC,
960 OptimizationRemarkEmitter *ORE, const Function *F,
961 const LoopVectorizeHints *Hints,
962 InterleavedAccessInfo &IAI)
963 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
964 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
965 Hints(Hints), InterleaveInfo(IAI) {}
966
967 /// \return An upper bound for the vectorization factor, or None if
968 /// vectorization and interleaving should be avoided up front.
969 Optional<unsigned> computeMaxVF();
970
971 /// \return True if runtime checks are required for vectorization, and false
972 /// otherwise.
973 bool runtimeChecksRequired();
974
975 /// \return The most profitable vectorization factor and the cost of that VF.
976 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
977 /// then this vectorization factor will be selected if vectorization is
978 /// possible.
979 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
980
981 /// Setup cost-based decisions for user vectorization factor.
982 void selectUserVectorizationFactor(unsigned UserVF) {
983 collectUniformsAndScalars(UserVF);
984 collectInstsToScalarize(UserVF);
985 }
986
987 /// \return The size (in bits) of the smallest and widest types in the code
988 /// that needs to be vectorized. We ignore values that remain scalar such as
989 /// 64 bit loop indices.
990 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
991
992 /// \return The desired interleave count.
993 /// If interleave count has been specified by metadata it will be returned.
994 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
995 /// are the selected vectorization factor and the cost of the selected VF.
996 unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
997
998 /// Memory access instruction may be vectorized in more than one way.
999 /// Form of instruction after vectorization depends on cost.
1000 /// This function takes cost-based decisions for Load/Store instructions
1001 /// and collects them in a map. This decisions map is used for building
1002 /// the lists of loop-uniform and loop-scalar instructions.
1003 /// The calculated cost is saved with widening decision in order to
1004 /// avoid redundant calculations.
1005 void setCostBasedWideningDecision(unsigned VF);
1006
1007 /// A struct that represents some properties of the register usage
1008 /// of a loop.
1009 struct RegisterUsage {
1010 /// Holds the number of loop invariant values that are used in the loop.
1011 /// The key is ClassID of target-provided register class.
1012 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1013 /// Holds the maximum number of concurrent live intervals in the loop.
1014 /// The key is ClassID of target-provided register class.
1015 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1016 };
1017
1018 /// \return Returns information about the register usages of the loop for the
1019 /// given vectorization factors.
1020 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1021
1022 /// Collect values we want to ignore in the cost model.
1023 void collectValuesToIgnore();
1024
1025 /// \returns The smallest bitwidth each instruction can be represented with.
1026 /// The vector equivalents of these instructions should be truncated to this
1027 /// type.
1028 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1029 return MinBWs;
1030 }
1031
1032 /// \returns True if it is more profitable to scalarize instruction \p I for
1033 /// vectorization factor \p VF.
1034 bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1035 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1036
1037 // Cost model is not run in the VPlan-native path - return conservative
1038 // result until this changes.
1039 if (EnableVPlanNativePath)
1040 return false;
1041
1042 auto Scalars = InstsToScalarize.find(VF);
1043 assert(Scalars != InstsToScalarize.end() &&
1044        "VF not yet analyzed for scalarization profitability");
1045 return Scalars->second.find(I) != Scalars->second.end();
1046 }
1047
1048 /// Returns true if \p I is known to be uniform after vectorization.
1049 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1050 if (VF == 1)
1051 return true;
1052
1053 // Cost model is not run in the VPlan-native path - return conservative
1054 // result until this changes.
1055 if (EnableVPlanNativePath)
1056 return false;
1057
1058 auto UniformsPerVF = Uniforms.find(VF);
1059 assert(UniformsPerVF != Uniforms.end() &&
1060        "VF not yet analyzed for uniformity");
1061 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1062 }
1063
1064 /// Returns true if \p I is known to be scalar after vectorization.
1065 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1066 if (VF == 1)
1067 return true;
1068
1069 // Cost model is not run in the VPlan-native path - return conservative
1070 // result until this changes.
1071 if (EnableVPlanNativePath)
1072 return false;
1073
1074 auto ScalarsPerVF = Scalars.find(VF);
1075 assert(ScalarsPerVF != Scalars.end() &&
1076        "Scalar values are not calculated for VF");
1077 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1078 }
1079
1080 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1081 /// for vectorization factor \p VF.
1082 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1083 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1084 !isProfitableToScalarize(I, VF) &&
1085 !isScalarAfterVectorization(I, VF);
1086 }
1087
1088 /// Decision that was taken during cost calculation for memory instruction.
1089 enum InstWidening {
1090 CM_Unknown,
1091 CM_Widen, // For consecutive accesses with stride +1.
1092 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093 CM_Interleave,
1094 CM_GatherScatter,
1095 CM_Scalarize
1096 };
1097
1098 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1099 /// instruction \p I and vector width \p VF.
1100 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1101 unsigned Cost) {
1102 assert(VF >= 2 && "Expected VF >=2");
1103 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1104 }
1105
1106 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107 /// interleaving group \p Grp and vector width \p VF.
1108 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1109 InstWidening W, unsigned Cost) {
1110 assert(VF >= 2 && "Expected VF >=2");
1111 /// Broadcast this decision to all instructions inside the group.
1112 /// But the cost will be assigned to one instruction only.
1113 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1114 if (auto *I = Grp->getMember(i)) {
1115 if (Grp->getInsertPos() == I)
1116 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1117 else
1118 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1119 }
1120 }
1121 }
1122
1123 /// Return the cost model decision for the given instruction \p I and vector
1124 /// width \p VF. Return CM_Unknown if this instruction did not pass
1125 /// through the cost modeling.
1126 InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1127 assert(VF >= 2 && "Expected VF >=2");
1128
1129 // Cost model is not run in the VPlan-native path - return conservative
1130 // result until this changes.
1131 if (EnableVPlanNativePath)
1132 return CM_GatherScatter;
1133
1134 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1135 auto Itr = WideningDecisions.find(InstOnVF);
1136 if (Itr == WideningDecisions.end())
1137 return CM_Unknown;
1138 return Itr->second.first;
1139 }
1140
1141 /// Return the vectorization cost for the given instruction \p I and vector
1142 /// width \p VF.
1143 unsigned getWideningCost(Instruction *I, unsigned VF) {
1144 assert(VF >= 2 && "Expected VF >=2");
1145 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1146 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1147 "The cost is not calculated");
1148 return WideningDecisions[InstOnVF].second;
1149 }
1150
1151 /// Return True if instruction \p I is an optimizable truncate whose operand
1152 /// is an induction variable. Such a truncate will be removed by adding a new
1153 /// induction variable with the destination type.
1154 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1155 // If the instruction is not a truncate, return false.
1156 auto *Trunc = dyn_cast<TruncInst>(I);
1157 if (!Trunc)
1158 return false;
1159
1160 // Get the source and destination types of the truncate.
1161 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1162 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1163
1164 // If the truncate is free for the given types, return false. Replacing a
1165 // free truncate with an induction variable would add an induction variable
1166 // update instruction to each iteration of the loop. We exclude from this
1167 // check the primary induction variable since it will need an update
1168 // instruction regardless.
1169 Value *Op = Trunc->getOperand(0);
1170 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1171 return false;
1172
1173 // If the truncated value is not an induction variable, return false.
1174 return Legal->isInductionPhi(Op);
1175 }
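// Illustrative sketch (assumed example, not from the original source): given
// a loop with an i64 induction phi %iv and a user "%idx = trunc i64 %iv to
// i32", this returns true when the trunc is not free (or %iv is the primary
// induction), so the vectorizer may replace the trunc with a new i32
// induction variable of the destination type.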
1176
1177 /// Collects the instructions to scalarize for each predicated instruction in
1178 /// the loop.
1179 void collectInstsToScalarize(unsigned VF);
1180
1181 /// Collect Uniform and Scalar values for the given \p VF.
1182 /// The sets depend on CM decision for Load/Store instructions
1183 /// that may be vectorized as interleave, gather-scatter or scalarized.
1184 void collectUniformsAndScalars(unsigned VF) {
1185 // Do the analysis once.
1186 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1187 return;
1188 setCostBasedWideningDecision(VF);
1189 collectLoopUniforms(VF);
1190 collectLoopScalars(VF);
1191 }
1192
1193 /// Returns true if the target machine supports masked store operation
1194 /// for the given \p DataType and kind of access to \p Ptr.
1195 bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1196 return Legal->isConsecutivePtr(Ptr) &&
1197 TTI.isLegalMaskedStore(DataType, Alignment);
1198 }
1199
1200 /// Returns true if the target machine supports masked load operation
1201 /// for the given \p DataType and kind of access to \p Ptr.
1202 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203 return Legal->isConsecutivePtr(Ptr) &&
1204 TTI.isLegalMaskedLoad(DataType, Alignment);
1205 }
1206
1207 /// Returns true if the target machine supports masked scatter operation
1208 /// for the given \p DataType.
1209 bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1210 return TTI.isLegalMaskedScatter(DataType, Alignment);
1211 }
1212
1213 /// Returns true if the target machine supports masked gather operation
1214 /// for the given \p DataType.
1215 bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1216 return TTI.isLegalMaskedGather(DataType, Alignment);
1217 }
1218
1219 /// Returns true if the target machine can represent \p V as a masked gather
1220 /// or scatter operation.
1221 bool isLegalGatherOrScatter(Value *V) {
1222 bool LI = isa<LoadInst>(V);
1223 bool SI = isa<StoreInst>(V);
1224 if (!LI && !SI)
1225 return false;
1226 auto *Ty = getMemInstValueType(V);
1227 MaybeAlign Align = getLoadStoreAlignment(V);
1228 return (LI && isLegalMaskedGather(Ty, Align)) ||
1229 (SI && isLegalMaskedScatter(Ty, Align));
1230 }
1231
1232 /// Returns true if \p I is an instruction that will be scalarized with
1233 /// predication. Such instructions include conditional stores and
1234 /// instructions that may divide by zero.
1235 /// If a non-zero VF has been calculated, we check if I will be scalarized
1236 /// with predication for that VF.
1237 bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1238
1239 // Returns true if \p I is an instruction that will be predicated either
1240 // through scalar predication or masked load/store or masked gather/scatter.
1241 // Superset of instructions that return true for isScalarWithPredication.
1242 bool isPredicatedInst(Instruction *I) {
1243 if (!blockNeedsPredication(I->getParent()))
1244 return false;
1245 // Loads and stores that need some form of masked operation are predicated
1246 // instructions.
1247 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1248 return Legal->isMaskRequired(I);
1249 return isScalarWithPredication(I);
1250 }
1251
1252 /// Returns true if \p I is a memory instruction with consecutive memory
1253 /// access that can be widened.
1254 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1255
1256 /// Returns true if \p I is a memory instruction in an interleaved-group
1257 /// of memory accesses that can be vectorized with wide vector loads/stores
1258 /// and shuffles.
1259 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1260
1261 /// Check if \p Instr belongs to any interleaved access group.
1262 bool isAccessInterleaved(Instruction *Instr) {
1263 return InterleaveInfo.isInterleaved(Instr);
1264 }
1265
1266 /// Get the interleaved access group that \p Instr belongs to.
1267 const InterleaveGroup<Instruction> *
1268 getInterleavedAccessGroup(Instruction *Instr) {
1269 return InterleaveInfo.getInterleaveGroup(Instr);
1270 }
1271
1272 /// Returns true if an interleaved group requires a scalar iteration
1273 /// to handle accesses with gaps, and there is nothing preventing us from
1274 /// creating a scalar epilogue.
1275 bool requiresScalarEpilogue() const {
1276 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1277 }
1278
1279 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1280 /// loop hint annotation.
1281 bool isScalarEpilogueAllowed() const {
1282 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1283 }
1284
1285 /// Returns true if all loop blocks should be masked to fold tail loop.
1286 bool foldTailByMasking() const { return FoldTailByMasking; }
1287
1288 bool blockNeedsPredication(BasicBlock *BB) {
1289 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1290 }
1291
1292 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1293 /// with factor VF. Return the cost of the instruction, including
1294 /// scalarization overhead if it's needed.
1295 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1296
1297 /// Estimate cost of a call instruction CI if it were vectorized with factor
1298 /// VF. Return the cost of the instruction, including scalarization overhead
1299 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1300 /// scalarized -
1301 /// i.e. either vector version isn't available, or is too expensive.
1302 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1303
1304private:
1305 unsigned NumPredStores = 0;
1306
1307 /// \return An upper bound for the vectorization factor, larger than zero.
1308 /// One is returned if vectorization should best be avoided due to cost.
1309 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1310
1311 /// The vectorization cost is a combination of the cost itself and a boolean
1312 /// indicating whether any of the contributing operations will actually
1313 /// operate on vector values after type legalization in the backend. If this
1314 /// latter value is false, then all operations will be scalarized (i.e. no
1315 /// vectorization has actually taken place).
1318 using VectorizationCostTy = std::pair<unsigned, bool>;
1319
1320 /// Returns the expected execution cost. The unit of the cost does
1321 /// not matter because we use the 'cost' units to compare different
1322 /// vector widths. The cost that is returned is *not* normalized by
1323 /// the factor width.
1324 VectorizationCostTy expectedCost(unsigned VF);
1325
1326 /// Returns the execution time cost of an instruction for a given vector
1327 /// width. Vector width of one means scalar.
1328 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1329
1330 /// The cost-computation logic from getInstructionCost which provides
1331 /// the vector type as an output parameter.
1332 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1333
1334 /// Calculate vectorization cost of memory instruction \p I.
1335 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1336
1337 /// The cost computation for scalarized memory instruction.
1338 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1339
1340 /// The cost computation for interleaving group of memory instructions.
1341 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342
1343 /// The cost computation for Gather/Scatter instruction.
1344 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345
1346 /// The cost computation for widening instruction \p I with consecutive
1347 /// memory access.
1348 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349
1350 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1351 /// Load: scalar load + broadcast.
1352 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1353 /// element)
1354 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355
1356 /// Estimate the overhead of scalarizing an instruction. This is a
1357 /// convenience wrapper for the type-based getScalarizationOverhead API.
1358 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359
1360 /// Returns whether the instruction is a load or store and will be emitted
1361 /// as a vector operation.
1362 bool isConsecutiveLoadOrStore(Instruction *I);
1363
1364 /// Returns true if an artificially high cost for emulated masked memrefs
1365 /// should be used.
1366 bool useEmulatedMaskMemRefHack(Instruction *I);
1367
1368 /// Map of scalar integer values to the smallest bitwidth they can be legally
1369 /// represented as. The vector equivalents of these values should be truncated
1370 /// to this type.
1371 MapVector<Instruction *, uint64_t> MinBWs;
1372
1373 /// A type representing the costs for instructions if they were to be
1374 /// scalarized rather than vectorized. The entries are Instruction-Cost
1375 /// pairs.
1376 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377
1378 /// A set containing all BasicBlocks that are known to be present after
1379 /// vectorization as a predicated block.
1380 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381
1382 /// Records whether it is allowed to have the original scalar loop execute at
1383 /// least once. This may be needed as a fallback loop in case runtime
1384 /// aliasing/dependence checks fail, or to handle the tail/remainder
1385 /// iterations when the trip count is unknown or doesn't divide by the VF,
1386 /// or as a peel-loop to handle gaps in interleave-groups.
1387 /// Under optsize and when the trip count is very small we don't allow any
1388 /// iterations to execute in the scalar loop.
1389 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390
1391 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1392 bool FoldTailByMasking = false;
1393
1394 /// A map holding scalar costs for different vectorization factors. The
1395 /// presence of a cost for an instruction in the mapping indicates that the
1396 /// instruction will be scalarized when vectorizing with the associated
1397 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399
1400 /// Holds the instructions known to be uniform after vectorization.
1401 /// The data is collected per VF.
1402 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403
1404 /// Holds the instructions known to be scalar after vectorization.
1405 /// The data is collected per VF.
1406 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407
1408 /// Holds the instructions (address computations) that are forced to be
1409 /// scalarized.
1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1411
1412 /// Returns the expected difference in cost from scalarizing the expression
1413 /// feeding a predicated instruction \p PredInst. The instructions to
1414 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1415 /// non-negative return value implies the expression will be scalarized.
1416 /// Currently, only single-use chains are considered for scalarization.
1417 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1418 unsigned VF);
1419
1420 /// Collect the instructions that are uniform after vectorization. An
1421 /// instruction is uniform if we represent it with a single scalar value in
1422 /// the vectorized loop corresponding to each vector iteration. Examples of
1423 /// uniform instructions include pointer operands of consecutive or
1424 /// interleaved memory accesses. Note that although uniformity implies an
1425 /// instruction will be scalar, the reverse is not true. In general, a
1426 /// scalarized instruction will be represented by VF scalar values in the
1427 /// vectorized loop, each corresponding to an iteration of the original
1428 /// scalar loop.
1429 void collectLoopUniforms(unsigned VF);
1430
1431 /// Collect the instructions that are scalar after vectorization. An
1432 /// instruction is scalar if it is known to be uniform or will be scalarized
1433 /// during vectorization. Non-uniform scalarized instructions will be
1434 /// represented by VF values in the vectorized loop, each corresponding to an
1435 /// iteration of the original scalar loop.
1436 void collectLoopScalars(unsigned VF);
1437
1438 /// Keeps cost model vectorization decision and cost for instructions.
1439 /// Right now it is used for memory instructions only.
1440 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1441 std::pair<InstWidening, unsigned>>;
1442
1443 DecisionList WideningDecisions;
1444
1445 /// Returns true if \p V is expected to be vectorized and it needs to be
1446 /// extracted.
1447 bool needsExtract(Value *V, unsigned VF) const {
1448 Instruction *I = dyn_cast<Instruction>(V);
1449 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1450 return false;
1451
1452 // Assume we can vectorize V (and hence we need extraction) if the
1453 // scalars are not computed yet. This can happen, because it is called
1454 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1455 // the scalars are collected. That should be a safe assumption in most
1456 // cases, because we check if the operands have vectorizable types
1457 // beforehand in LoopVectorizationLegality.
1458 return Scalars.find(VF) == Scalars.end() ||
1459 !isScalarAfterVectorization(I, VF);
1460 };
1461
1462 /// Returns a range containing only operands needing to be extracted.
1463 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1464 unsigned VF) {
1465 return SmallVector<Value *, 4>(make_filter_range(
1466 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1467 }
1468
1469public:
1470 /// The loop that we evaluate.
1471 Loop *TheLoop;
1472
1473 /// Predicated scalar evolution analysis.
1474 PredicatedScalarEvolution &PSE;
1475
1476 /// Loop Info analysis.
1477 LoopInfo *LI;
1478
1479 /// Vectorization legality.
1480 LoopVectorizationLegality *Legal;
1481
1482 /// Vector target information.
1483 const TargetTransformInfo &TTI;
1484
1485 /// Target Library Info.
1486 const TargetLibraryInfo *TLI;
1487
1488 /// Demanded bits analysis.
1489 DemandedBits *DB;
1490
1491 /// Assumption cache.
1492 AssumptionCache *AC;
1493
1494 /// Interface to emit optimization remarks.
1495 OptimizationRemarkEmitter *ORE;
1496
1497 const Function *TheFunction;
1498
1499 /// Loop Vectorize Hint.
1500 const LoopVectorizeHints *Hints;
1501
1502 /// The interleave access information contains groups of interleaved accesses
1503 /// with the same stride and close to each other.
1504 InterleavedAccessInfo &InterleaveInfo;
1505
1506 /// Values to ignore in the cost model.
1507 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1508
1509 /// Values to ignore in the cost model when VF > 1.
1510 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511};
1512
1513} // end namespace llvm
1514
1515// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516// vectorization. The loop needs to be annotated with #pragma omp simd
1517// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1518// vector length information is not provided, vectorization is not considered
1519// explicit. Interleave hints are not allowed either. These limitations will be
1520// relaxed in the future.
1521 // Please note that we are currently forced to abuse the pragma 'clang
1522// vectorize' semantics. This pragma provides *auto-vectorization hints*
1523// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524// provides *explicit vectorization hints* (LV can bypass legal checks and
1525// assume that vectorization is legal). However, both hints are implemented
1526// using the same metadata (llvm.loop.vectorize, processed by
1527// LoopVectorizeHints). This will be fixed in the future when the native IR
1528// representation for pragma 'omp simd' is introduced.
1529static bool isExplicitVecOuterLoop(Loop *OuterLp,
1530 OptimizationRemarkEmitter *ORE) {
1531 assert(!OuterLp->empty() && "This is not an outer loop");
1532 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1533
1534 // Only outer loops with an explicit vectorization hint are supported.
1535 // Unannotated outer loops are ignored.
1536 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1537 return false;
1538
1539 Function *Fn = OuterLp->getHeader()->getParent();
1540 if (!Hints.allowVectorization(Fn, OuterLp,
1541 true /*VectorizeOnlyWhenForced*/)) {
1542 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1543 return false;
1544 }
1545
1546 if (Hints.getInterleave() > 1) {
1547 // TODO: Interleave support is future work.
1548 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1549 "outer loops.\n");
1550 Hints.emitRemarkWithHints();
1551 return false;
1552 }
1553
1554 return true;
1555}
1556
1557static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1558 OptimizationRemarkEmitter *ORE,
1559 SmallVectorImpl<Loop *> &V) {
1560 // Collect inner loops and outer loops without irreducible control flow. For
1561 // now, only collect outer loops that have explicit vectorization hints. If we
1562 // are stress testing the VPlan H-CFG construction, we collect the outermost
1563 // loop of every loop nest.
1564 if (L.empty() || VPlanBuildStressTest ||
1565 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1566 LoopBlocksRPO RPOT(&L);
1567 RPOT.perform(LI);
1568 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1569 V.push_back(&L);
1570 // TODO: Collect inner loops inside marked outer loops in case
1571 // vectorization fails for the outer loop. Do not invoke
1572 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1573 // already known to be reducible. We can use an inherited attribute for
1574 // that.
1575 return;
1576 }
1577 }
1578 for (Loop *InnerL : L)
1579 collectSupportedLoops(*InnerL, LI, ORE, V);
1580}
1581
1582namespace {
1583
1584/// The LoopVectorize Pass.
1585struct LoopVectorize : public FunctionPass {
1586 /// Pass identification, replacement for typeid
1587 static char ID;
1588
1589 LoopVectorizePass Impl;
1590
1591 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1592 bool VectorizeOnlyWhenForced = false)
1593 : FunctionPass(ID) {
1594 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1595 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1596 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1597 }
1598
1599 bool runOnFunction(Function &F) override {
1600 if (skipFunction(F))
1601 return false;
1602
1603 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1605 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1607 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1608 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1609 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1610 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1611 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1612 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1613 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1614 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1615 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1616
1617 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1618 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1619
1620 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1621 GetLAA, *ORE, PSI);
1622 }
1623
1624 void getAnalysisUsage(AnalysisUsage &AU) const override {
1625 AU.addRequired<AssumptionCacheTracker>();
1626 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1627 AU.addRequired<DominatorTreeWrapperPass>();
1628 AU.addRequired<LoopInfoWrapperPass>();
1629 AU.addRequired<ScalarEvolutionWrapperPass>();
1630 AU.addRequired<TargetTransformInfoWrapperPass>();
1631 AU.addRequired<AAResultsWrapperPass>();
1632 AU.addRequired<LoopAccessLegacyAnalysis>();
1633 AU.addRequired<DemandedBitsWrapperPass>();
1634 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1635 AU.addRequired<InjectTLIMappingsLegacy>();
1636
1637 // We currently do not preserve loopinfo/dominator analyses with outer loop
1638 // vectorization. Until this is addressed, mark these analyses as preserved
1639 // only for non-VPlan-native path.
1640 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1641 if (!EnableVPlanNativePath) {
1642 AU.addPreserved<LoopInfoWrapperPass>();
1643 AU.addPreserved<DominatorTreeWrapperPass>();
1644 }
1645
1646 AU.addPreserved<BasicAAWrapperPass>();
1647 AU.addPreserved<GlobalsAAWrapperPass>();
1648 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1649 }
1650};
1651
1652} // end anonymous namespace
1653
1654//===----------------------------------------------------------------------===//
1655// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1656// LoopVectorizationCostModel and LoopVectorizationPlanner.
1657//===----------------------------------------------------------------------===//
1658
1659Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1660 // We need to place the broadcast of invariant variables outside the loop,
1661 // but only if it's proven safe to do so. Else, broadcast will be inside
1662 // vector loop body.
1663 Instruction *Instr = dyn_cast<Instruction>(V);
1664 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1665 (!Instr ||
1666 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1667 // Place the code for broadcasting invariant variables in the new preheader.
1668 IRBuilder<>::InsertPointGuard Guard(Builder);
1669 if (SafeToHoist)
1670 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1671
1672 // Broadcast the scalar into all locations in the vector.
1673 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1674
1675 return Shuf;
1676}
1677
1678void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1679 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1680 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1681 "Expected either an induction phi-node or a truncate of it!");
1682 Value *Start = II.getStartValue();
1683
1684 // Construct the initial value of the vector IV in the vector loop preheader
1685 auto CurrIP = Builder.saveIP();
1686 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1687 if (isa<TruncInst>(EntryVal)) {
1688 assert(Start->getType()->isIntegerTy() &&
1689 "Truncation requires an integer type");
1690 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1691 Step = Builder.CreateTrunc(Step, TruncType);
1692 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1693 }
1694 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1695 Value *SteppedStart =
1696 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1697
1698 // We create vector phi nodes for both integer and floating-point induction
1699 // variables. Here, we determine the kind of arithmetic we will perform.
1700 Instruction::BinaryOps AddOp;
1701 Instruction::BinaryOps MulOp;
1702 if (Step->getType()->isIntegerTy()) {
1703 AddOp = Instruction::Add;
1704 MulOp = Instruction::Mul;
1705 } else {
1706 AddOp = II.getInductionOpcode();
1707 MulOp = Instruction::FMul;
1708 }
1709
1710 // Multiply the vectorization factor by the step using integer or
1711 // floating-point arithmetic as appropriate.
1712 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1713 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1714
1715 // Create a vector splat to use in the induction update.
1716 //
1717 // FIXME: If the step is non-constant, we create the vector splat with
1718 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1719 // handle a constant vector splat.
1720 Value *SplatVF = isa<Constant>(Mul)
1721 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1722 : Builder.CreateVectorSplat(VF, Mul);
1723 Builder.restoreIP(CurrIP);
1724
1725 // We may need to add the step a number of times, depending on the unroll
1726 // factor. The last of those goes into the PHI.
1727 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1728 &*LoopVectorBody->getFirstInsertionPt());
1729 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1730 Instruction *LastInduction = VecInd;
1731 for (unsigned Part = 0; Part < UF; ++Part) {
1732 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1733
1734 if (isa<TruncInst>(EntryVal))
1735 addMetadata(LastInduction, EntryVal);
1736 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1737
1738 LastInduction = cast<Instruction>(addFastMathFlag(
1739 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1740 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1741 }
1742
1743 // Move the last step to the end of the latch block. This ensures consistent
1744 // placement of all induction updates.
1745 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1746 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1747 auto *ICmp = cast<Instruction>(Br->getCondition());
1748 LastInduction->moveBefore(ICmp);
1749 LastInduction->setName("vec.ind.next");
1750
1751 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1752 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1753}
1754
1755bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1756 return Cost->isScalarAfterVectorization(I, VF) ||
1757 Cost->isProfitableToScalarize(I, VF);
1758}
1759
1760bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1761 if (shouldScalarizeInstruction(IV))
1762 return true;
1763 auto isScalarInst = [&](User *U) -> bool {
1764 auto *I = cast<Instruction>(U);
1765 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1766 };
1767 return llvm::any_of(IV->users(), isScalarInst);
1768}
1769
1770void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1771 const InductionDescriptor &ID, const Instruction *EntryVal,
1772 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1773 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1774 "Expected either an induction phi-node or a truncate of it!");
1775
1776 // This induction variable is not the phi from the original loop but the
1777 // newly-created IV based on the proof that the casted Phi is equal to the
1778 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1779 // re-uses the same InductionDescriptor that the original IV uses, but we
1780 // don't have to do any recording in this case - that is done when the
1781 // original IV is processed.
1782 if (isa<TruncInst>(EntryVal))
1783 return;
1784
1785 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1786 if (Casts.empty())
1787 return;
1788 // Only the first Cast instruction in the Casts vector is of interest.
1789 // The rest of the Casts (if exist) have no uses outside the
1790 // induction update chain itself.
1791 Instruction *CastInst = *Casts.begin();
1792 if (Lane < UINT_MAX)
1793 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1794 else
1795 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1796}
1797
1798void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1799 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1800 "Primary induction variable must have an integer type");
1801
1802 auto II = Legal->getInductionVars().find(IV);
1803 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1804
1805 auto ID = II->second;
1806 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1807
1808 // The scalar value to broadcast. This will be derived from the canonical
1809 // induction variable.
1810 Value *ScalarIV = nullptr;
1811
1812 // The value from the original loop to which we are mapping the new induction
1813 // variable.
1814 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1815
1816 // True if we have vectorized the induction variable.
1817 auto VectorizedIV = false;
1818
1819 // Determine if we want a scalar version of the induction variable. This is
1820 // true if the induction variable itself is not widened, or if it has at
1821 // least one user in the loop that is not widened.
1822 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1823
1824 // Generate code for the induction step. Note that induction steps are
1825 // required to be loop-invariant.
1826 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1827 "Induction step should be loop invariant");
1828 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1829 Value *Step = nullptr;
1830 if (PSE.getSE()->isSCEVable(IV->getType())) {
1831 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1832 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1833 LoopVectorPreHeader->getTerminator());
1834 } else {
1835 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1836 }
1837
1838 // Try to create a new independent vector induction variable. If we can't
1839 // create the phi node, we will splat the scalar induction variable in each
1840 // loop iteration.
1841 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1842 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1843 VectorizedIV = true;
1844 }
1845
1846 // If we haven't yet vectorized the induction variable, or if we will create
1847 // a scalar one, we need to define the scalar induction variable and step
1848 // values. If we were given a truncation type, truncate the canonical
1849 // induction variable and step. Otherwise, derive these values from the
1850 // induction descriptor.
1851 if (!VectorizedIV || NeedsScalarIV) {
1852 ScalarIV = Induction;
1853 if (IV != OldInduction) {
1854 ScalarIV = IV->getType()->isIntegerTy()
1855 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1856 : Builder.CreateCast(Instruction::SIToFP, Induction,
1857 IV->getType());
1858 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1859 ScalarIV->setName("offset.idx");
1860 }
1861 if (Trunc) {
1862 auto *TruncType = cast<IntegerType>(Trunc->getType());
1863 assert(Step->getType()->isIntegerTy() &&
1864 "Truncation requires an integer step");
1865 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1866 Step = Builder.CreateTrunc(Step, TruncType);
1867 }
1868 }
1869
1870 // If we haven't yet vectorized the induction variable, splat the scalar
1871 // induction variable, and build the necessary step vectors.
1872 // TODO: Don't do it unless the vectorized IV is really required.
1873 if (!VectorizedIV) {
1874 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1875 for (unsigned Part = 0; Part < UF; ++Part) {
1876 Value *EntryPart =
1877 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1878 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1879 if (Trunc)
1880 addMetadata(EntryPart, Trunc);
1881 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1882 }
1883 }
1884
1885 // If an induction variable is only used for counting loop iterations or
1886 // calculating addresses, it doesn't need to be widened. Create scalar steps
1887 // that can be used by instructions we will later scalarize. Note that the
1888 // addition of the scalar steps will not increase the number of instructions
1889 // in the loop in the common case prior to InstCombine. We will be trading
1890 // one vector extract for each scalar step.
1891 if (NeedsScalarIV)
1892 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1893}
1894
1895Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1896 Instruction::BinaryOps BinOp) {
1897 // Create and check the types.
1898 assert(Val->getType()->isVectorTy() && "Must be a vector");
1899 int VLen = Val->getType()->getVectorNumElements();
1900
1901 Type *STy = Val->getType()->getScalarType();
1902 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1903 "Induction Step must be an integer or FP");
1904 assert(Step->getType() == STy && "Step has wrong type");
1905
1906 SmallVector<Constant *, 8> Indices;
1907
1908 if (STy->isIntegerTy()) {
1909 // Create a vector of consecutive numbers from zero to VF.
1910 for (int i = 0; i < VLen; ++i)
1911 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1912
1913 // Add the consecutive indices to the vector value.
1914 Constant *Cv = ConstantVector::get(Indices);
1915 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1916 Step = Builder.CreateVectorSplat(VLen, Step);
1917 assert(Step->getType() == Val->getType() && "Invalid step vec");
1918 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1919 // which can be found from the original scalar operations.
1920 Step = Builder.CreateMul(Cv, Step);
1921 return Builder.CreateAdd(Val, Step, "induction");
1922 }
1923
1924 // Floating point induction.
1925 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1926 "Binary Opcode should be specified for FP induction");
1927 // Create a vector of consecutive numbers from zero to VF.
1928 for (int i = 0; i < VLen; ++i)
1929 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1930
1931 // Add the consecutive indices to the vector value.
1932 Constant *Cv = ConstantVector::get(Indices);
1933
1934 Step = Builder.CreateVectorSplat(VLen, Step);
1935
1936 // Floating point operations had to be 'fast' to enable the induction.
1937 FastMathFlags Flags;
1938 Flags.setFast();
1939
1940 Value *MulOp = Builder.CreateFMul(Cv, Step);
1941 if (isa<Instruction>(MulOp))
1942 // Have to check, MulOp may be a constant
1943 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1944
1945 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1946 if (isa<Instruction>(BOp))
1947 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1948 return BOp;
1949}
1950
1951void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1952 Instruction *EntryVal,
1953 const InductionDescriptor &ID) {
1954 // We shouldn't have to build scalar steps if we aren't vectorizing.
1955 assert(VF > 1 && "VF should be greater than one");
1956
1957 // Get the value type and ensure it and the step have the same integer type.
1958 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1959 assert(ScalarIVTy == Step->getType() &&
1960 "Val and Step should have the same type");
1961
1962 // We build scalar steps for both integer and floating-point induction
1963 // variables. Here, we determine the kind of arithmetic we will perform.
1964 Instruction::BinaryOps AddOp;
1965 Instruction::BinaryOps MulOp;
1966 if (ScalarIVTy->isIntegerTy()) {
1967 AddOp = Instruction::Add;
1968 MulOp = Instruction::Mul;
1969 } else {
1970 AddOp = ID.getInductionOpcode();
1971 MulOp = Instruction::FMul;
1972 }
1973
1974 // Determine the number of scalars we need to generate for each unroll
1975 // iteration. If EntryVal is uniform, we only need to generate the first
1976 // lane. Otherwise, we generate all VF values.
1977 unsigned Lanes =
1978 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1979 : VF;
1980 // Compute the scalar steps and save the results in VectorLoopValueMap.
1981 for (unsigned Part = 0; Part < UF; ++Part) {
1982 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1983 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1984 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1985 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1986 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1987 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1988 }
1989 }
1990}
1991
1992Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1993 assert(V != Induction && "The new induction variable should not be used.");
1994 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1995 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1996
1997 // If we have a stride that is replaced by one, do it here. Defer this for
1998 // the VPlan-native path until we start running Legal checks in that path.
1999 if (!EnableVPlanNativePath && Legal->hasStride(V))
2000 V = ConstantInt::get(V->getType(), 1);
2001
2002 // If we have a vector mapped to this value, return it.
2003 if (VectorLoopValueMap.hasVectorValue(V, Part))
2004 return VectorLoopValueMap.getVectorValue(V, Part);
2005
2006 // If the value has not been vectorized, check if it has been scalarized
2007 // instead. If it has been scalarized, and we actually need the value in
2008 // vector form, we will construct the vector values on demand.
2009 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2010 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2011
2012 // If we've scalarized a value, that value should be an instruction.
2013 auto *I = cast<Instruction>(V);
2014
2015 // If we aren't vectorizing, we can just copy the scalar map values over to
2016 // the vector map.
2017 if (VF == 1) {
2018 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2019 return ScalarValue;
2020 }
2021
2022 // Get the last scalar instruction we generated for V and Part. If the value
2023 // is known to be uniform after vectorization, this corresponds to lane zero
2024 // of the Part unroll iteration. Otherwise, the last instruction is the one
2025 // we created for the last vector lane of the Part unroll iteration.
2026 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2027 auto *LastInst = cast<Instruction>(
2028 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2029
2030 // Set the insert point after the last scalarized instruction. This ensures
2031 // the insertelement sequence will directly follow the scalar definitions.
2032 auto OldIP = Builder.saveIP();
2033 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2034 Builder.SetInsertPoint(&*NewIP);
2035
2036 // However, if we are vectorizing, we need to construct the vector values.
2037 // If the value is known to be uniform after vectorization, we can just
2038 // broadcast the scalar value corresponding to lane zero for each unroll
2039 // iteration. Otherwise, we construct the vector values using insertelement
2040 // instructions. Since the resulting vectors are stored in
2041 // VectorLoopValueMap, we will only generate the insertelements once.
2042 Value *VectorValue = nullptr;
2043 if (Cost->isUniformAfterVectorization(I, VF)) {
2044 VectorValue = getBroadcastInstrs(ScalarValue);
2045 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2046 } else {
2047 // Initialize packing with insertelements to start from undef.
2048 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2049 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2050 for (unsigned Lane = 0; Lane < VF; ++Lane)
2051 packScalarIntoVectorValue(V, {Part, Lane});
2052 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2053 }
2054 Builder.restoreIP(OldIP);
2055 return VectorValue;
2056 }
2057
2058 // If this scalar is unknown, assume that it is a constant or that it is
2059 // loop invariant. Broadcast V and save the value for future uses.
2060 Value *B = getBroadcastInstrs(V);
2061 VectorLoopValueMap.setVectorValue(V, Part, B);
2062 return B;
2063}
2064
2065Value *
2066InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2067 const VPIteration &Instance) {
2068 // If the value is not an instruction contained in the loop, it should
2069 // already be scalar.
2070 if (OrigLoop->isLoopInvariant(V))
2071 return V;
2072
2073 assert(Instance.Lane > 0
2074 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2075 : true && "Uniform values only have lane zero");
2076
2077 // If the value from the original loop has not been vectorized, it is
2078 // represented by UF x VF scalar values in the new loop. Return the requested
2079 // scalar value.
2080 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2081 return VectorLoopValueMap.getScalarValue(V, Instance);
2082
2083 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2084 // for the given unroll part. If this entry is not a vector type (i.e., the
2085 // vectorization factor is one), there is no need to generate an
2086 // extractelement instruction.
2087 auto *U = getOrCreateVectorValue(V, Instance.Part);
2088 if (!U->getType()->isVectorTy()) {
2089 assert(VF == 1 && "Value not scalarized has non-vector type");
2090 return U;
2091 }
2092
2093 // Otherwise, the value from the original loop has been vectorized and is
2094 // represented by UF vector values. Extract and return the requested scalar
2095 // value from the appropriate vector lane.
2096 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2097}
2098
2099void InnerLoopVectorizer::packScalarIntoVectorValue(
2100 Value *V, const VPIteration &Instance) {
2101   assert(V != Induction && "The new induction variable should not be used.");
2102   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2103   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2104
2105 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2106 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2107 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2108 Builder.getInt32(Instance.Lane));
2109 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2110}
2111
2112Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2113   assert(Vec->getType()->isVectorTy() && "Invalid type");
2114 SmallVector<Constant *, 8> ShuffleMask;
2115 for (unsigned i = 0; i < VF; ++i)
2116 ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2117
2118 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2119 ConstantVector::get(ShuffleMask),
2120 "reverse");
2121}
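For reference, the shuffle mask built by reverseVector simply lists the lane indices in descending order. Below is a minimal standalone C++ sketch (not part of LoopVectorize.cpp; VF = 4 is an assumed value) of the indices the loop above produces:

#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 4; // assumed vectorization factor
  std::vector<unsigned> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(VF - i - 1); // same formula as the loop above
  for (unsigned Idx : ShuffleMask)
    std::printf("%u ", Idx);           // prints "3 2 1 0": lane order reversed
  std::printf("\n");
  return 0;
}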
2122
2123// Return whether we allow using masked interleave-groups (for dealing with
2124// strided loads/stores that reside in predicated blocks, or for dealing
2125// with gaps).
2126static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2127 // If an override option has been passed in for interleaved accesses, use it.
2128 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2129 return EnableMaskedInterleavedMemAccesses;
2130
2131 return TTI.enableMaskedInterleavedAccessVectorization();
2132}
2133
2134// Try to vectorize the interleave group that \p Instr belongs to.
2135//
2136// E.g. Translate the following interleaved load group (factor = 3):
2137// for (i = 0; i < N; i+=3) {
2138// R = Pic[i]; // Member of index 0
2139// G = Pic[i+1]; // Member of index 1
2140// B = Pic[i+2]; // Member of index 2
2141// ... // do something to R, G, B
2142// }
2143// To:
2144// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2145// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2146// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2147// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2148//
2149// Or translate the following interleaved store group (factor = 3):
2150// for (i = 0; i < N; i+=3) {
2151// ... do something to R, G, B
2152// Pic[i] = R; // Member of index 0
2153// Pic[i+1] = G; // Member of index 1
2154// Pic[i+2] = B; // Member of index 2
2155// }
2156// To:
2157// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2158// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2159// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2160// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2161// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2162void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2163 VPTransformState &State,
2164 VPValue *Addr,
2165 VPValue *BlockInMask) {
2166 const InterleaveGroup<Instruction> *Group =
2167 Cost->getInterleavedAccessGroup(Instr);
2168   assert(Group && "Fail to get an interleaved access group.");
2169
2170 // Skip if current instruction is not the insert position.
2171 if (Instr != Group->getInsertPos())
2172 return;
2173
2174 const DataLayout &DL = Instr->getModule()->getDataLayout();
2175
2176 // Prepare for the vector type of the interleaved load/store.
2177 Type *ScalarTy = getMemInstValueType(Instr);
2178 unsigned InterleaveFactor = Group->getFactor();
2179 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2180
2181 // Prepare for the new pointers.
2182 SmallVector<Value *, 2> AddrParts;
2183 unsigned Index = Group->getIndex(Instr);
2184
2185 // TODO: extend the masked interleaved-group support to reversed access.
2186   assert((!BlockInMask || !Group->isReverse()) &&
2187          "Reversed masked interleave-group not supported.");
2188
2189 // If the group is reverse, adjust the index to refer to the last vector lane
2190 // instead of the first. We adjust the index from the first vector lane,
2191 // rather than directly getting the pointer for lane VF - 1, because the
2192 // pointer operand of the interleaved access is supposed to be uniform. For
2193 // uniform instructions, we're only required to generate a value for the
2194 // first vector lane in each unroll iteration.
2195 if (Group->isReverse())
2196 Index += (VF - 1) * Group->getFactor();
2197
2198 for (unsigned Part = 0; Part < UF; Part++) {
2199 Value *AddrPart = State.get(Addr, {Part, 0});
2200 setDebugLocFromInst(Builder, AddrPart);
2201
2202     // Notice that the current instruction could be at any index. We need to
2203     // adjust the address to the member of index 0.
2204     //
2205     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2206     //       b = A[i];       // Member of index 0
2207     // The current pointer points to A[i+1]; adjust it to A[i].
2208     //
2209     // E.g.  A[i+1] = a;     // Member of index 1
2210     //       A[i]   = b;     // Member of index 0
2211     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2212     // The current pointer points to A[i+2]; adjust it to A[i].
2213
2214 bool InBounds = false;
2215 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2216 InBounds = gep->isInBounds();
2217 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2218 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2219
2220 // Cast to the vector pointer type.
2221 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2222 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2223 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2224 }
2225
2226 setDebugLocFromInst(Builder, Instr);
2227 Value *UndefVec = UndefValue::get(VecTy);
2228
2229 Value *MaskForGaps = nullptr;
2230 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2231 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2232     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2233 }
2234
2235 // Vectorize the interleaved load group.
2236 if (isa<LoadInst>(Instr)) {
2237 // For each unroll part, create a wide load for the group.
2238 SmallVector<Value *, 2> NewLoads;
2239 for (unsigned Part = 0; Part < UF; Part++) {
2240 Instruction *NewLoad;
2241 if (BlockInMask || MaskForGaps) {
2242         assert(useMaskedInterleavedAccesses(*TTI) &&
2243                "masked interleaved groups are not allowed.");
2244 Value *GroupMask = MaskForGaps;
2245 if (BlockInMask) {
2246 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2247 auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2248 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2249 Value *ShuffledMask = Builder.CreateShuffleVector(
2250 BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2251 GroupMask = MaskForGaps
2252 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2253 MaskForGaps)
2254 : ShuffledMask;
2255 }
2256 NewLoad =
2257 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2258 GroupMask, UndefVec, "wide.masked.vec");
2259 }
2260 else
2261 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2262 Group->getAlign(), "wide.vec");
2263 Group->addMetadata(NewLoad);
2264 NewLoads.push_back(NewLoad);
2265 }
2266
2267 // For each member in the group, shuffle out the appropriate data from the
2268 // wide loads.
2269 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2270 Instruction *Member = Group->getMember(I);
2271
2272 // Skip the gaps in the group.
2273 if (!Member)
2274 continue;
2275
2276 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2277 for (unsigned Part = 0; Part < UF; Part++) {
2278 Value *StridedVec = Builder.CreateShuffleVector(
2279 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2280
2281         // If this member has a different type, cast the result type.
2282 if (Member->getType() != ScalarTy) {
2283 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2284 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2285 }
2286
2287 if (Group->isReverse())
2288 StridedVec = reverseVector(StridedVec);
2289
2290 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2291 }
2292 }
2293 return;
2294 }
2295
2296 // The sub vector type for current instruction.
2297 VectorType *SubVT = VectorType::get(ScalarTy, VF);
2298
2299 // Vectorize the interleaved store group.
2300 for (unsigned Part = 0; Part < UF; Part++) {
2301 // Collect the stored vector from each member.
2302 SmallVector<Value *, 4> StoredVecs;
2303 for (unsigned i = 0; i < InterleaveFactor; i++) {
2304       // An interleaved store group doesn't allow a gap, so each index has a member.
2305 Instruction *Member = Group->getMember(i);
2306       assert(Member && "Fail to get a member from an interleaved store group");
2307
2308 Value *StoredVec = getOrCreateVectorValue(
2309 cast<StoreInst>(Member)->getValueOperand(), Part);
2310 if (Group->isReverse())
2311 StoredVec = reverseVector(StoredVec);
2312
2313       // If this member has a different type, cast it to a unified type.
2314
2315 if (StoredVec->getType() != SubVT)
2316 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2317
2318 StoredVecs.push_back(StoredVec);
2319 }
2320
2321 // Concatenate all vectors into a wide vector.
2322 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2323
2324 // Interleave the elements in the wide vector.
2325 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2326 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2327 "interleaved.vec");
2328
2329 Instruction *NewStoreInstr;
2330 if (BlockInMask) {
2331 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2332 auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2333 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2334 Value *ShuffledMask = Builder.CreateShuffleVector(
2335 BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2336 NewStoreInstr = Builder.CreateMaskedStore(
2337 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2338 }
2339 else
2340 NewStoreInstr =
2341 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2342
2343 Group->addMetadata(NewStoreInstr);
2344 }
2345}
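For the masked load and store paths above, the per-lane block mask is widened so that it covers every member of the interleave group ("interleaved.mask"). The following is a standalone sketch of that replication, assuming VF = 4, an interleave factor of 3, and a made-up block mask; it uses plain C++ rather than the LLVM API:

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 4, Factor = 3;
  const std::array<bool, VF> BlockMask = {true, true, false, true}; // hypothetical
  std::array<bool, VF * Factor> Replicated{};
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned Member = 0; Member < Factor; ++Member)
      Replicated[Lane * Factor + Member] = BlockMask[Lane]; // replicate each bit
  // All three members of lane 2 end up masked off together.
  for (bool B : Replicated)
    std::printf("%d ", B ? 1 : 0);
  std::printf("\n");
  return 0;
}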
2346
2347void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2348 VPTransformState &State,
2349 VPValue *Addr,
2350 VPValue *BlockInMask) {
2351 // Attempt to issue a wide load.
2352 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2353 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2354
2355   assert((LI || SI) && "Invalid Load/Store instruction");
2356
2357 LoopVectorizationCostModel::InstWidening Decision =
2358 Cost->getWideningDecision(Instr, VF);
2359   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2360          "CM decision should be taken at this point");
2361 if (Decision == LoopVectorizationCostModel::CM_Interleave)
2362 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2363
2364 Type *ScalarDataTy = getMemInstValueType(Instr);
2365 Type *DataTy = VectorType::get(ScalarDataTy, VF);
2366   // An alignment of 0 means target ABI alignment. We need to use the scalar's
2367   // target ABI alignment in such a case.
2368 const DataLayout &DL = Instr->getModule()->getDataLayout();
2369 const Align Alignment =
2370 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2371
2372 // Determine if the pointer operand of the access is either consecutive or
2373 // reverse consecutive.
2374 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2375 bool ConsecutiveStride =
2376 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2377 bool CreateGatherScatter =
2378 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2379
2380 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2381 // gather/scatter. Otherwise Decision should have been to Scalarize.
2382   assert((ConsecutiveStride || CreateGatherScatter) &&
2383          "The instruction should be scalarized");
2384 (void)ConsecutiveStride;
2385
2386 VectorParts BlockInMaskParts(UF);
2387 bool isMaskRequired = BlockInMask;
2388 if (isMaskRequired)
2389 for (unsigned Part = 0; Part < UF; ++Part)
2390 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2391
2392 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2393 // Calculate the pointer for the specific unroll-part.
2394 GetElementPtrInst *PartPtr = nullptr;
2395
2396 bool InBounds = false;
2397 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2398 InBounds = gep->isInBounds();
2399
2400 if (Reverse) {
2401 // If the address is consecutive but reversed, then the
2402 // wide store needs to start at the last vector element.
2403 PartPtr = cast<GetElementPtrInst>(
2404 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2405 PartPtr->setIsInBounds(InBounds);
2406 PartPtr = cast<GetElementPtrInst>(
2407 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2408 PartPtr->setIsInBounds(InBounds);
2409 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2410 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2411 } else {
2412 PartPtr = cast<GetElementPtrInst>(
2413 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2414 PartPtr->setIsInBounds(InBounds);
2415 }
2416
2417 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2418 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2419 };
2420
2421 // Handle Stores:
2422 if (SI) {
2423 setDebugLocFromInst(Builder, SI);
2424
2425 for (unsigned Part = 0; Part < UF; ++Part) {
2426 Instruction *NewSI = nullptr;
2427 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2428 if (CreateGatherScatter) {
2429 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2430 Value *VectorGep = State.get(Addr, Part);
2431 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2432 MaskPart);
2433 } else {
2434 if (Reverse) {
2435 // If we store to reverse consecutive memory locations, then we need
2436 // to reverse the order of elements in the stored value.
2437 StoredVal = reverseVector(StoredVal);
2438 // We don't want to update the value in the map as it might be used in
2439 // another expression. So don't call resetVectorValue(StoredVal).
2440 }
2441 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2442 if (isMaskRequired)
2443 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2444 BlockInMaskParts[Part]);
2445 else
2446 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2447 }
2448 addMetadata(NewSI, SI);
2449 }
2450 return;
2451 }
2452
2453 // Handle loads.
2454   assert(LI && "Must have a load instruction");
2455 setDebugLocFromInst(Builder, LI);
2456 for (unsigned Part = 0; Part < UF; ++Part) {
2457 Value *NewLI;
2458 if (CreateGatherScatter) {
2459 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2460 Value *VectorGep = State.get(Addr, Part);
2461 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2462 nullptr, "wide.masked.gather");
2463 addMetadata(NewLI, LI);
2464 } else {
2465 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2466 if (isMaskRequired)
2467 NewLI = Builder.CreateMaskedLoad(
2468 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2469 "wide.masked.load");
2470 else
2471 NewLI =
2472 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2473
2474 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2475 addMetadata(NewLI, LI);
2476 if (Reverse)
2477 NewLI = reverseVector(NewLI);
2478 }
2479 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2480 }
2481}
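The reverse-consecutive case in CreateVecPtr applies two negative GEP offsets: -Part * VF to select the unroll part, then 1 - VF to step back to the lowest element of that part. Below is a standalone sketch of the resulting element ranges, assuming VF = 4 and UF = 2; it is plain integer arithmetic, not the IR that is actually emitted:

#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int Start = -Part * VF + (1 - VF); // the two GEP offsets applied in turn
    // Part 0 covers elements [-3 .. 0], Part 1 covers elements [-7 .. -4];
    // each wide access is then flipped with reverseVector.
    std::printf("part %d covers elements [%d .. %d]\n", Part, Start,
                Start + VF - 1);
  }
  return 0;
}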
2482
2483void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2484 const VPIteration &Instance,
2485 bool IfPredicateInstr) {
2486   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2487
2488 setDebugLocFromInst(Builder, Instr);
2489
2490 // Does this instruction return a value ?
2491 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2492
2493 Instruction *Cloned = Instr->clone();
2494 if (!IsVoidRetTy)
2495 Cloned->setName(Instr->getName() + ".cloned");
2496
2497 // Replace the operands of the cloned instructions with their scalar
2498 // equivalents in the new loop.
2499 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2500 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2501 Cloned->setOperand(op, NewOp);
2502 }
2503 addNewMetadata(Cloned, Instr);
2504
2505 // Place the cloned scalar in the new loop.
2506 Builder.Insert(Cloned);
2507
2508 // Add the cloned scalar to the scalar map entry.
2509 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2510
2511 // If we just cloned a new assumption, add it the assumption cache.
2512 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2513 if (II->getIntrinsicID() == Intrinsic::assume)
2514 AC->registerAssumption(II);
2515
2516 // End if-block.
2517 if (IfPredicateInstr)
2518 PredicatedInstructions.push_back(Cloned);
2519}
2520
2521PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2522 Value *End, Value *Step,
2523 Instruction *DL) {
2524 BasicBlock *Header = L->getHeader();
2525 BasicBlock *Latch = L->getLoopLatch();
2526 // As we're just creating this loop, it's possible no latch exists
2527 // yet. If so, use the header as this will be a single block loop.
2528 if (!Latch)
2529 Latch = Header;
2530
2531 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2532 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2533 setDebugLocFromInst(Builder, OldInst);
2534 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2535
2536 Builder.SetInsertPoint(Latch->getTerminator());
2537 setDebugLocFromInst(Builder, OldInst);
2538
2539 // Create i+1 and fill the PHINode.
2540 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2541 Induction->addIncoming(Start, L->getLoopPreheader());
2542 Induction->addIncoming(Next, Latch);
2543 // Create the compare.
2544 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2545 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2546
2547 // Now we have two terminators. Remove the old one from the block.
2548 Latch->getTerminator()->eraseFromParent();
2549
2550 return Induction;
2551}
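The IR that createInductionVariable emits (phi, add, compare, conditional branch) behaves like a bottom-tested loop: the body runs with the current index, the incremented index is compared against End, and the loop exits when they are equal. A plain C++ analogue under assumed values for Start, Step and End, shown only to illustrate the control flow, not the generated code:

#include <cstdio>

int main() {
  long Start = 0, Step = 8, End = 64; // e.g. VF * UF = 8, vector trip count = 64
  long Index = Start;                 // the "index" phi
  do {
    std::printf("vector iteration at index %ld\n", Index);
    Index += Step;                    // "index.next"
  } while (Index != End);             // icmp eq + conditional branch back
  return 0;
}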
2552
2553Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2554 if (TripCount)
2555 return TripCount;
2556
2557   assert(L && "Create Trip Count for null loop.");
2558 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2559 // Find the loop boundaries.
2560 ScalarEvolution *SE = PSE.getSE();
2561 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2562   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2563          "Invalid loop count");
2564
2565 Type *IdxTy = Legal->getWidestInductionType();
2566   assert(IdxTy && "No type for induction");
2567
2568 // The exit count might have the type of i64 while the phi is i32. This can
2569 // happen if we have an induction variable that is sign extended before the
2570 // compare. The only way that we get a backedge taken count is that the
2571 // induction variable was signed and as such will not overflow. In such a case
2572 // truncation is legal.
2573 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2574 IdxTy->getPrimitiveSizeInBits())
2575 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2576 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2577
2578 // Get the total trip count from the count by adding 1.
2579 const SCEV *ExitCount = SE->getAddExpr(
2580 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2581
2582 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2583
2584 // Expand the trip count and place the new instructions in the preheader.
2585 // Notice that the pre-header does not change, only the loop body.
2586 SCEVExpander Exp(*SE, DL, "induction");
2587
2588 // Count holds the overall loop count (N).
2589 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2590 L->getLoopPreheader()->getTerminator());
2591
2592 if (TripCount->getType()->isPointerTy())
2593 TripCount =
2594 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2595 L->getLoopPreheader()->getTerminator());
2596
2597 return TripCount;
2598}
2599
2600Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2601 if (VectorTripCount)
2602 return VectorTripCount;
2603
2604 Value *TC = getOrCreateTripCount(L);
2605 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2606
2607 Type *Ty = TC->getType();
2608 Constant *Step = ConstantInt::get(Ty, VF * UF);
2609
2610 // If the tail is to be folded by masking, round the number of iterations N
2611 // up to a multiple of Step instead of rounding down. This is done by first
2612 // adding Step-1 and then rounding down. Note that it's ok if this addition
2613 // overflows: the vector induction variable will eventually wrap to zero given
2614 // that it starts at zero and its Step is a power of two; the loop will then
2615 // exit, with the last early-exit vector comparison also producing all-true.
2616 if (Cost->foldTailByMasking()) {
2617     assert(isPowerOf2_32(VF * UF) &&
2618            "VF*UF must be a power of 2 when folding tail by masking");
2619 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2620 }
2621
2622 // Now we need to generate the expression for the part of the loop that the
2623 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2624 // iterations are not required for correctness, or N - Step, otherwise. Step
2625 // is equal to the vectorization factor (number of SIMD elements) times the
2626 // unroll factor (number of SIMD instructions).
2627 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2628
2629 // If there is a non-reversed interleaved group that may speculatively access
2630 // memory out-of-bounds, we need to ensure that there will be at least one
2631 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2632 // the trip count, we set the remainder to be equal to the step. If the step
2633 // does not evenly divide the trip count, no adjustment is necessary since
2634 // there will already be scalar iterations. Note that the minimum iterations
2635 // check ensures that N >= Step.
2636 if (VF > 1 && Cost->requiresScalarEpilogue()) {
2637 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2638 R = Builder.CreateSelect(IsZero, Step, R);
2639 }
2640
2641 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2642
2643 return VectorTripCount;
2644}
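A worked example of the arithmetic above, using assumed values (N = 10 scalar iterations, VF = 4, UF = 1). The first case drops the remainder iterations to the scalar epilogue; the second rounds N up because tail folding lets the mask handle the final, partial vector iteration:

#include <cassert>

int main() {
  unsigned N = 10, VF = 4, UF = 1;
  unsigned Step = VF * UF;

  // No tail folding: n.vec = N - (N % Step); 2 iterations remain for the
  // scalar loop.
  unsigned VecTripCount = N - N % Step;
  assert(VecTripCount == 8);

  // Tail folding: first round N up by adding Step - 1 ("n.rnd.up"), then
  // subtract the remainder. The last vector iteration covers lanes 8..11,
  // with lanes 10 and 11 masked off.
  unsigned Rounded = N + (Step - 1);          // 13
  unsigned Folded = Rounded - Rounded % Step; // 12
  assert(Folded == 12);
  return 0;
}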
2645
2646Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2647 const DataLayout &DL) {
2648 // Verify that V is a vector type with same number of elements as DstVTy.
2649 unsigned VF = DstVTy->getNumElements();
2650 VectorType *SrcVecTy = cast<VectorType>(V->getType());
2651   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2652 Type *SrcElemTy = SrcVecTy->getElementType();
2653 Type *DstElemTy = DstVTy->getElementType();
2654   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2655          "Vector elements must have same size");
2656
2657 // Do a direct cast if element types are castable.
2658 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2659 return Builder.CreateBitOrPointerCast(V, DstVTy);
2660 }
2661 // V cannot be directly casted to desired vector type.
2662 // May happen when V is a floating point vector but DstVTy is a vector of
2663 // pointers or vice-versa. Handle this using a two-step bitcast using an
2664 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2665   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2666          "Only one type should be a pointer type");
2667   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2668          "Only one type should be a floating point type");
2669 Type *IntTy =
2670 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2671 VectorType *VecIntTy = VectorType::get(IntTy, VF);
2672 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2673 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2674}
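As a scalar-level analogy for the two-step cast above (FP <-> Int <-> Ptr), the snippet below round-trips a double through a same-width integer and a pointer, assuming a 64-bit target. It only mirrors the idea that the intermediate integer type carries the raw bits; it is not the IR that createBitOrPointerCast builds:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double D = 3.5;
  std::uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));                     // FP -> same-width int
  void *P = reinterpret_cast<void *>(Bits);                 // int -> pointer
  std::uint64_t Back = reinterpret_cast<std::uint64_t>(P);  // pointer -> int
  double D2;
  std::memcpy(&D2, &Back, sizeof(D2));                      // int -> FP
  assert(D2 == 3.5); // the bit pattern survives the round trip
  return 0;
}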
2675
2676void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2677 BasicBlock *Bypass) {
2678 Value *Count = getOrCreateTripCount(L);
2679 // Reuse existing vector loop preheader for TC checks.
2680 // Note that new preheader block is generated for vector loop.
2681 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2682 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2683
2684 // Generate code to check if the loop's trip count is less than VF * UF, or
2685 // equal to it in case a scalar epilogue is required; this implies that the
2686 // vector trip count is zero. This check also covers the case where adding one
2687 // to the backedge-taken count overflowed leading to an incorrect trip count
2688 // of zero. In this case we will also jump to the scalar loop.
2689 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2690 : ICmpInst::ICMP_ULT;
2691
2692 // If tail is to be folded, vector loop takes care of all iterations.
2693 Value *CheckMinIters = Builder.getFalse();
2694 if (!Cost->foldTailByMasking())
2695 CheckMinIters = Builder.CreateICmp(
2696 P, Count, ConstantInt::get(Count->getType(), VF * UF),
2697 "min.iters.check");
2698
2699 // Create new preheader for vector loop.
2700 LoopVectorPreHeader =
2701 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2702 "vector.ph");
2703
2704   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2705                                DT->getNode(Bypass)->getIDom()) &&
2706          "TC check is expected to dominate Bypass");
2707
2708 // Update dominator for Bypass & LoopExit.
2709 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2710 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2711
2712 ReplaceInstWithInst(
2713 TCCheckBlock->getTerminator(),
2714 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2715 LoopBypassBlocks.push_back(TCCheckBlock);
2716}
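Below is a sketch of the predicate choice in the minimum-iteration check above, with assumed values (VF = 4, UF = 2, a trip count of 8) and plain C++ standing in for the emitted compare and branch:

#include <cstdio>

int main() {
  unsigned VF = 4, UF = 2, Count = 8;
  bool RequiresScalarEpilogue = true; // assumed cost-model decision
  // ICMP_ULE when a scalar epilogue is required (the vector loop must leave
  // at least one scalar iteration behind), ICMP_ULT otherwise.
  bool CheckMinIters = RequiresScalarEpilogue ? Count <= VF * UF
                                              : Count < VF * UF;
  std::printf(CheckMinIters ? "bypass to the scalar loop\n"
                            : "fall through to vector.ph\n");
  return 0;
}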
2717
2718void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2719 // Reuse existing vector loop preheader for SCEV checks.
2720 // Note that new preheader block is generated for vector loop.
2721 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2722
2723   // Generate the code to check the SCEV assumptions that we made.
2724 // We want the new basic block to start at the first instruction in a
2725 // sequence of instructions that form a check.
2726 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2727 "scev.check");
2728 Value *SCEVCheck = Exp.expandCodeForPredicate(
2729 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2730
2731 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2732 if (C->isZero())
2733 return;
2734
2735   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2736          "Cannot SCEV check stride or overflow when optimizing for size");
2737
2738 SCEVCheckBlock->setName("vector.scevcheck");
2739 // Create new preheader for vector loop.
2740 LoopVectorPreHeader =
2741 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2742 nullptr, "vector.ph");
2743
2744 // Update dominator only if this is first RT check.
2745 if (LoopBypassBlocks.empty()) {
2746 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2747 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2748 }
2749
2750 ReplaceInstWithInst(
2751 SCEVCheckBlock->getTerminator(),
2752 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2753 LoopBypassBlocks.push_back(SCEVCheckBlock);
2754 AddedSafetyChecks = true;
2755}
2756
2757void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2758 // VPlan-native path does not do any analysis for runtime checks currently.
2759 if (EnableVPlanNativePath)
2760 return;
2761
2762 // Reuse existing vector loop preheader for runtime memory checks.
2763 // Note that new preheader block is generated for vector loop.
2764 BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2765
2766 // Generate the code that checks in runtime if arrays overlap. We put the
2767 // checks into a separate block to make the more common case of few elements
2768 // faster.
2769 Instruction *FirstCheckInst;
2770 Instruction *MemRuntimeCheck;
2771 std::tie(FirstCheckInst, MemRuntimeCheck) =
2772 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2773 if (!MemRuntimeCheck)
2774 return;
2775
2776 if (MemCheckBlock->getParent()->hasOptSize()) {
2777     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2778            "Cannot emit memory checks when optimizing for size, unless forced "
2779            "to vectorize.");
2780 ORE->emit([&]() {
2781       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2782 L->getStartLoc(), L->getHeader())
2783 << "Code-size may be reduced by not forcing "
2784 "vectorization, or by source-code modifications "
2785 "eliminating the need for runtime checks "
2786 "(e.g., adding 'restrict').";
2787 });
2788 }
2789
2790 MemCheckBlock->setName("vector.memcheck");
2791 // Create new preheader for vector loop.
2792 LoopVectorPreHeader =
2793 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2794 "vector.ph");
2795
2796 // Update dominator only if this is first RT check.
2797 if (LoopBypassBlocks.empty()) {
2798 DT->changeImmediateDominator(Bypass, MemCheckBlock);
2799 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2800 }
2801
2802 ReplaceInstWithInst(
2803 MemCheckBlock->getTerminator(),
2804 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2805 LoopBypassBlocks.push_back(MemCheckBlock);
2806 AddedSafetyChecks = true;
2807
2808 // We currently don't use LoopVersioning for the actual loop cloning but we
2809 // still use it to add the noalias metadata.
2810 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2811 PSE.getSE());
2812 LVer->prepareNoAliasMetadata();
2813}
2814
2815Value *InnerLoopVectorizer::emitTransformedIndex(
2816 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2817 const InductionDescriptor &ID) const {
2818
2819 SCEVExpander Exp(*SE, DL, "induction");
2820 auto Step = ID.getStep();
2821 auto StartValue = ID.getStartValue();
2822   assert(Index->getType() == Step->getType() &&
2823          "Index type does not match StepValue type");
2824
2825 // Note: the IR at this point is broken. We cannot use SE to create any new
2826 // SCEV and then expand it, hoping that SCEV's simplification will give us
2827 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2828 // lead to various SCEV crashes. So all we can do is to use builder and rely
2829 // on InstCombine for future simplifications. Here we handle some trivial
2830 // cases only.
2831 auto CreateAdd = [&B](Value *X, Value *Y) {
2832     assert(X->getType() == Y->getType() && "Types don't match!");
2833 if (auto *CX = dyn_cast<ConstantInt>(X))
2834 if (CX->isZero())
2835 return Y;
2836 if (auto *CY = dyn_cast<ConstantInt>(Y))
2837 if (CY->isZero())
2838 return X;
2839 return B.CreateAdd(X, Y);
2840 };
2841
2842 auto CreateMul = [&B](Value *X, Value *Y) {
2843     assert(X->getType() == Y->getType() && "Types don't match!");
2844 if (auto *CX = dyn_cast<ConstantInt>(X))
2845 if (CX->isOne())
2846 return Y;
2847 if (auto *CY = dyn_cast<ConstantInt>(Y))
2848 if (CY->isOne())
2849 return X;
2850 return B.CreateMul(X, Y);
2851 };
2852
2853 switch (ID.getKind()) {
2854 case InductionDescriptor::IK_IntInduction: {
2855     assert(Index->getType() == StartValue->getType() &&
2856            "Index type does not match StartValue type");
2857 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2858 return B.CreateSub(StartValue, Index);
2859 auto *Offset = CreateMul(
2860 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2861 return CreateAdd(StartValue, Offset);
2862 }
2863 case InductionDescriptor::IK_PtrInduction: {
2864     assert(isa<SCEVConstant>(Step) &&
2865            "Expected constant step for pointer induction");
2866 return B.CreateGEP(
2867 StartValue->getType()->getPointerElementType(), StartValue,
2868 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2869 &*B.GetInsertPoint())));
2870 }
2871 case InductionDescriptor::IK_FpInduction: {
2872     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2873 auto InductionBinOp = ID.getInductionBinOp();
2874     assert(InductionBinOp &&
2875            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2876             InductionBinOp->getOpcode() == Instruction::FSub) &&
2877            "Original bin op should be defined for FP induction");
2878
2879 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2880
2881 // Floating point operations had to be 'fast' to enable the induction.
2882 FastMathFlags Flags;
2883 Flags.setFast();
2884
2885 Value *MulExp = B.CreateFMul(StepValue, Index);
2886 if (isa<Instruction>(MulExp))
2887 // We have to check, the MulExp may be a constant.
2888 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2889
2890 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2891 "induction");
2892 if (isa<Instruction>(BOp))
2893 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2894
2895 return BOp;
2896 }
2897 case InductionDescriptor::IK_NoInduction:
2898 return nullptr;
2899 }
2900   llvm_unreachable("invalid enum");
2901}
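For the IK_IntInduction case above, the transformed index is simply StartValue + Index * Step, with the trivial zero/one operands folded away by the CreateAdd/CreateMul helpers. A worked example with assumed values:

#include <cassert>

int main() {
  long StartValue = 100, Step = 3; // assumed induction: 100, 103, 106, ...
  long Index = 8;                  // e.g. the vector trip count
  long End = StartValue + Index * Step;
  assert(End == 124);              // resume value for the scalar loop's IV
  return 0;
}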
2902
2903BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2904 /*
2905 In this function we generate a new loop. The new loop will contain
2906 the vectorized instructions while the old loop will continue to run the
2907 scalar remainder.
2908
2909 [ ] <-- loop iteration number check.
2910 / |
2911 / v
2912 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2913 | / |
2914 | / v
2915 || [ ] <-- vector pre header.
2916 |/ |
2917 | v
2918 | [ ] \
2919 | [ ]_| <-- vector loop.
2920 | |
2921 | v
2922 | -[ ] <--- middle-block.
2923 | / |
2924 | / v
2925 -|- >[ ] <--- new preheader.
2926 | |
2927 | v
2928 | [ ] \
2929 | [ ]_| <-- old scalar loop to handle remainder.
2930 \ |
2931 \ v
2932 >[ ] <-- exit block.
2933 ...
2934 */
2935
2936 MDNode *OrigLoopID = OrigLoop->getLoopID();
2937
2938 // Some loops have a single integer induction variable, while other loops
2939 // don't. One example is c++ iterators that often have multiple pointer
2940 // induction variables. In the code below we also support a case where we
2941 // don't have a single induction variable.
2942 //
2943 // We try to obtain an induction variable from the original loop as hard
2944 // as possible. However if we don't find one that:
2945 // - is an integer
2946 // - counts from zero, stepping by one
2947 // - is the size of the widest induction variable type
2948 // then we create a new one.
2949 OldInduction = Legal->getPrimaryInduction();
2950 Type *IdxTy = Legal->getWidestInductionType();
2951
2952 // Split the single block loop into the two loop structure described above.
2953 LoopScalarBody = OrigLoop->getHeader();
2954 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2955 LoopExitBlock = OrigLoop->getExitBlock();
2956   assert(LoopExitBlock && "Must have an exit block");
2957   assert(LoopVectorPreHeader && "Invalid loop structure");
2958
2959 LoopMiddleBlock =
2960 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2961 LI, nullptr, "middle.block");
2962 LoopScalarPreHeader =
2963 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2964 nullptr, "scalar.ph");
2965   // We intentionally don't let SplitBlock update LoopInfo since
2966   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
2967   // LoopVectorBody is explicitly added to the correct place a few lines later.
2968 LoopVectorBody =
2969 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2970 nullptr, nullptr, "vector.body");
2971
2972 // Update dominator for loop exit.
2973 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2974
2975 // Create and register the new vector loop.
2976 Loop *Lp = LI->AllocateLoop();
2977 Loop *ParentLoop = OrigLoop->getParentLoop();
2978
2979 // Insert the new loop into the loop nest and register the new basic blocks
2980 // before calling any utilities such as SCEV that require valid LoopInfo.
2981 if (ParentLoop) {
2982 ParentLoop->addChildLoop(Lp);
2983 } else {
2984 LI->addTopLevelLoop(Lp);
2985 }
2986 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2987
2988 // Find the loop boundaries.
2989 Value *Count = getOrCreateTripCount(Lp);
2990
2991 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2992
2993 // Now, compare the new count to zero. If it is zero skip the vector loop and
2994 // jump to the scalar loop. This check also covers the case where the
2995 // backedge-taken count is uint##_max: adding one to it will overflow leading
2996 // to an incorrect trip count of zero. In this (rare) case we will also jump
2997 // to the scalar loop.
2998 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
2999
3000 // Generate the code to check any assumptions that we've made for SCEV
3001 // expressions.
3002 emitSCEVChecks(Lp, LoopScalarPreHeader);
3003
3004 // Generate the code that checks in runtime if arrays overlap. We put the
3005 // checks into a separate block to make the more common case of few elements
3006 // faster.
3007 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3008
3009 // Generate the induction variable.
3010 // The loop step is equal to the vectorization factor (num of SIMD elements)
3011 // times the unroll factor (num of SIMD instructions).
3012 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3013 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3014 Induction =
3015 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3016 getDebugLocFromInstOrOperands(OldInduction));
3017
3018 // We are going to resume the execution of the scalar loop.
3019 // Go over all of the induction variables that we found and fix the
3020 // PHIs that are left in the scalar version of the loop.
3021 // The starting values of PHI nodes depend on the counter of the last
3022 // iteration in the vectorized loop.
3023 // If we come from a bypass edge then we need to start from the original
3024 // start value.
3025
3026 // This variable saves the new starting index for the scalar loop. It is used
3027 // to test if there are any tail iterations left once the vector loop has
3028 // completed.
3029 for (auto &InductionEntry : Legal->getInductionVars()) {
3030 PHINode *OrigPhi = InductionEntry.first;
3031 InductionDescriptor II = InductionEntry.second;
3032
3033 // Create phi nodes to merge from the backedge-taken check block.
3034 PHINode *BCResumeVal =
3035 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3036 LoopScalarPreHeader->getTerminator());
3037 // Copy original phi DL over to the new one.
3038 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3039 Value *&EndValue = IVEndValues[OrigPhi];
3040 if (OrigPhi == OldInduction) {
3041 // We know what the end value is.
3042 EndValue = CountRoundDown;
3043 } else {
3044 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3045 Type *StepType = II.getStep()->getType();
3046 Instruction::CastOps CastOp =
3047 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3048 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3049 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3050 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3051 EndValue->setName("ind.end");
3052 }
3053
3054 // The new PHI merges the original incoming value, in case of a bypass,
3055 // or the value at the end of the vectorized loop.
3056 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3057
3058 // Fix the scalar body counter (PHI node).
3059 // The old induction's phi node in the scalar body needs the truncated
3060 // value.
3061 for (BasicBlock *BB : LoopBypassBlocks)
3062 BCResumeVal->addIncoming(II.getStartValue(), BB);
3063 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3064 }
3065
3066 // We need the OrigLoop (scalar loop part) latch terminator to help
3067 // produce correct debug info for the middle block BB instructions.
3068 // The legality check stage guarantees that the loop will have a single
3069 // latch.
3070 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3071 "Scalar loop latch terminator isn't a branch");
3072 BranchInst *ScalarLatchBr =
3073 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3074
3075 // Add a check in the middle block to see if we have completed
3076 // all of the iterations in the first vector loop.
3077 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3078 // If tail is to be folded, we know we don't need to run the remainder.
3079 Value *CmpN = Builder.getTrue();
3080 if (!Cost->foldTailByMasking()) {
3081 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3082 CountRoundDown, "cmp.n",
3083 LoopMiddleBlock->getTerminator());
3084
3085 // Here we use the same DebugLoc as the scalar loop latch branch instead
3086 // of the corresponding compare because they may have ended up with
3087 // different line numbers and we want to avoid awkward line stepping while
3088 // debugging, e.g. if the compare got a line number inside the loop.
3089 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3090 }
3091
3092 BranchInst *BrInst =
3093 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3094 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3095 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3096
3097 // Get ready to start creating new instructions into the vectorized body.
3098 assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3099 "Inconsistent vector loop preheader");
3100 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3101
3102 Optional<MDNode *> VectorizedLoopID =
3103 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3104 LLVMLoopVectorizeFollowupVectorized});
3105 if (VectorizedLoopID.hasValue()) {
3106 Lp->setLoopID(VectorizedLoopID.getValue());
3107
3108 // Do not setAlreadyVectorized if loop attributes have been defined
3109 // explicitly.
3110 return LoopVectorPreHeader;
3111 }
3112
3113 // Keep all loop hints from the original loop on the vector loop (we'll
3114 // replace the vectorizer-specific hints below).
3115 if (MDNode *LID = OrigLoop->getLoopID())
3116 Lp->setLoopID(LID);
3117
3118 LoopVectorizeHints Hints(Lp, true, *ORE);
3119 Hints.setAlreadyVectorized();
3120
3121#ifdef EXPENSIVE_CHECKS
3122 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3123 LI->verify(*DT);
3124#endif
3125
3126 return LoopVectorPreHeader;
3127}
3128
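For orientation, here is a minimal standalone sketch (plain C++, not part of LoopVectorize.cpp, and ignoring tail folding) of the trip-count arithmetic the skeleton above relies on: the vector loop executes the original trip count rounded down to a multiple of VF * UF, and the middle-block "cmp.n" check skips the scalar remainder exactly when nothing is left over.

#include <cstdint>

// CountRoundDown: the number of iterations executed by the vector loop,
// i.e. the original trip count rounded down to a multiple of VF * UF.
uint64_t vectorTripCount(uint64_t Count, unsigned VF, unsigned UF) {
  uint64_t Step = uint64_t(VF) * UF;
  return Count - (Count % Step);
}

// Mirrors the "cmp.n" comparison placed in the middle block: when
// (N - N % (VF * UF)) == N there is nothing left for the scalar remainder.
bool remainderLoopNeeded(uint64_t Count, unsigned VF, unsigned UF) {
  return vectorTripCount(Count, VF, UF) != Count;
}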
3129// Fix up external users of the induction variable. At this point, we are
3130// in LCSSA form, with all external PHIs that use the IV having one input value,
3131// coming from the remainder loop. We need those PHIs to also have a correct
3132// value for the IV when arriving directly from the middle block.
3133void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3134 const InductionDescriptor &II,
3135 Value *CountRoundDown, Value *EndValue,
3136 BasicBlock *MiddleBlock) {
3137 // There are two kinds of external IV usages - those that use the value
3138 // computed in the last iteration (the PHI) and those that use the penultimate
3139 // value (the value that feeds into the phi from the loop latch).
3140 // We allow both, but they, obviously, have different values.
3141
3142 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3143
3144 DenseMap<Value *, Value *> MissingVals;
3145
3146 // An external user of the last iteration's value should see the value that
3147 // the remainder loop uses to initialize its own IV.
3148 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3149 for (User *U : PostInc->users()) {
3150 Instruction *UI = cast<Instruction>(U);
3151 if (!OrigLoop->contains(UI)) {
3152 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3153 MissingVals[UI] = EndValue;
3154 }
3155 }
3156
3157 // An external user of the penultimate value needs to see EndValue - Step.
3158 // The simplest way to get this is to recompute it from the constituent SCEVs,
3159 // that is Start + (Step * (CRD - 1)).
3160 for (User *U : OrigPhi->users()) {
3161 auto *UI = cast<Instruction>(U);
3162 if (!OrigLoop->contains(UI)) {
3163 const DataLayout &DL =
3164 OrigLoop->getHeader()->getModule()->getDataLayout();
3165 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3166
3167 IRBuilder<> B(MiddleBlock->getTerminator());
3168 Value *CountMinusOne = B.CreateSub(
3169 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3170 Value *CMO =
3171 !II.getStep()->getType()->isIntegerTy()
3172 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3173 II.getStep()->getType())
3174 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3175 CMO->setName("cast.cmo");
3176 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3177 Escape->setName("ind.escape");
3178 MissingVals[UI] = Escape;
3179 }
3180 }
3181
3182 for (auto &I : MissingVals) {
3183 PHINode *PHI = cast<PHINode>(I.first);
3184 // One corner case we have to handle is two IVs "chasing" each other,
3185 // that is %IV2 = phi [...], [ %IV1, %latch ]
3186 // In this case, if IV1 has an external use, we need to avoid adding both
3187 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3188 // don't already have an incoming value for the middle block.
3189 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3190 PHI->addIncoming(I.second, MiddleBlock);
3191 }
3192}
3193
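As an illustration only, a small standalone sketch (hypothetical names, assuming a simple linear integer induction) of the two external values the function above reconstructs: the last value seen by users of the post-increment, and the penultimate value Start + Step * (CRD - 1) seen by users of the phi itself.

#include <cstdint>

// For an induction variable starting at Start with stride Step, after the
// vector loop has executed CountRoundDown iterations:
//  - a user of the post-increment value sees the "last" value,
//  - a user of the phi itself sees the "penultimate" value,
// which is what the Start + (Step * (CRD - 1)) recomputation above produces.
int64_t lastIVValue(int64_t Start, int64_t Step, uint64_t CountRoundDown) {
  return Start + Step * int64_t(CountRoundDown);
}

int64_t penultimateIVValue(int64_t Start, int64_t Step, uint64_t CountRoundDown) {
  return Start + Step * (int64_t(CountRoundDown) - 1);
}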
3194namespace {
3195
3196struct CSEDenseMapInfo {
3197 static bool canHandle(const Instruction *I) {
3198 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3199 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3200 }
3201
3202 static inline Instruction *getEmptyKey() {
3203 return DenseMapInfo<Instruction *>::getEmptyKey();
3204 }
3205
3206 static inline Instruction *getTombstoneKey() {
3207 return DenseMapInfo<Instruction *>::getTombstoneKey();
3208 }
3209
3210 static unsigned getHashValue(const Instruction *I) {
3211 assert(canHandle(I) && "Unknown instruction!");
3212 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3213 I->value_op_end()));
3214 }
3215
3216 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3217 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3218 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3219 return LHS == RHS;
3220 return LHS->isIdenticalTo(RHS);
3221 }
3222};
3223
3224} // end anonymous namespace
3225
3226 /// Perform CSE of induction variable instructions.
3227 static void cse(BasicBlock *BB) {
3228 // Perform simple CSE.
3229 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3230 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3231 Instruction *In = &*I++;
3232
3233 if (!CSEDenseMapInfo::canHandle(In))
3234 continue;
3235
3236 // Check if we can replace this instruction with any of the
3237 // visited instructions.
3238 if (Instruction *V = CSEMap.lookup(In)) {
3239 In->replaceAllUsesWith(V);
3240 In->eraseFromParent();
3241 continue;
3242 }
3243
3244 CSEMap[In] = In;
3245 }
3246}
3247
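A minimal toy sketch of the same single-pass CSE idea, keyed on an opcode/operand tuple over a made-up instruction type rather than llvm::Instruction and CSEDenseMapInfo:

#include <map>
#include <utility>
#include <vector>

struct ToyInst { int Opcode; std::vector<int> Ops; int Id; };

// For each instruction, returns the id it should be replaced with: its own id
// if it is the first occurrence of that opcode/operand combination, otherwise
// the id of the earlier identical instruction.
std::vector<int> cseToy(const std::vector<ToyInst> &Block) {
  std::map<std::pair<int, std::vector<int>>, int> Seen;
  std::vector<int> ReplaceWith;
  for (const ToyInst &I : Block) {
    auto Key = std::make_pair(I.Opcode, I.Ops);
    auto It = Seen.find(Key);
    if (It == Seen.end()) {
      Seen.emplace(Key, I.Id);
      ReplaceWith.push_back(I.Id);       // first occurrence keeps its own id
    } else {
      ReplaceWith.push_back(It->second); // reuse the earlier identical instruction
    }
  }
  return ReplaceWith;
}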
3248unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3249 unsigned VF,
3250 bool &NeedToScalarize) {
3251 Function *F = CI->getCalledFunction();
3252 Type *ScalarRetTy = CI->getType();
3253 SmallVector<Type *, 4> Tys, ScalarTys;
3254 for (auto &ArgOp : CI->arg_operands())
3255 ScalarTys.push_back(ArgOp->getType());
3256
3257 // Estimate cost of scalarized vector call. The source operands are assumed
3258 // to be vectors, so we need to extract individual elements from there,
3259 // execute VF scalar calls, and then gather the result into the vector return
3260 // value.
3261 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3262 if (VF == 1)
3263 return ScalarCallCost;
3264
3265 // Compute corresponding vector type for return value and arguments.
3266 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3267 for (Type *ScalarTy : ScalarTys)
3268 Tys.push_back(ToVectorTy(ScalarTy, VF));
3269
3270 // Compute costs of unpacking argument values for the scalar calls and
3271 // packing the return values to a vector.
3272 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3273
3274 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3275
3276 // If we can't emit a vector call for this function, then the currently found
3277 // cost is the cost we need to return.
3278 NeedToScalarize = true;
3279 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3280 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3281
3282 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3283 return Cost;
3284
3285 // If the corresponding vector cost is cheaper, return its cost.
3286 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3287 if (VectorCallCost < Cost) {
3288 NeedToScalarize = false;
3289 return VectorCallCost;
3290 }
3291 return Cost;
3292}
3293
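The decision above reduces to comparing two cost estimates; a small standalone sketch of that arithmetic (illustrative names, not the TTI API):

// Scalarized cost: VF scalar calls plus the cost of unpacking the vector
// arguments and repacking the results. The vector variant is preferred only
// when a vectorized function exists and its call cost is strictly cheaper.
unsigned pickCallCost(unsigned ScalarCallCost, unsigned ScalarizationCost,
                      unsigned VF, bool HasVectorVariant,
                      unsigned VectorCallCost, bool &NeedToScalarize) {
  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorCallCost < ScalarizedCost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return ScalarizedCost;
}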
3294unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3295 unsigned VF) {
3296 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3297 assert(ID && "Expected intrinsic call!");
3298
3299 FastMathFlags FMF;
3300 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3301 FMF = FPMO->getFastMathFlags();
3302
3303 SmallVector<Value *, 4> Operands(CI->arg_operands());
3304 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3305}
3306
3307static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3308 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3309 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3310 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3311}
3312static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3313 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3314 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3315 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3316}
3317
3318void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3319 // For every instruction `I` in MinBWs, truncate the operands, create a
3320 // truncated version of `I` and reextend its result. InstCombine runs
3321 // later and will remove any ext/trunc pairs.
3322 SmallPtrSet<Value *, 4> Erased;
3323 for (const auto &KV : Cost->getMinimalBitwidths()) {
3324 // If the value wasn't vectorized, we must maintain the original scalar
3325 // type. The absence of the value from VectorLoopValueMap indicates that it
3326 // wasn't vectorized.
3327 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3328 continue;
3329 for (unsigned Part = 0; Part < UF; ++Part) {
3330 Value *I = getOrCreateVectorValue(KV.first, Part);
3331 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3332 !isa<Instruction>(I))
3333 continue;
3334 Type *OriginalTy = I->getType();
3335 Type *ScalarTruncatedTy =
3336 IntegerType::get(OriginalTy->getContext(), KV.second);
3337 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3338 OriginalTy->getVectorNumElements());
3339 if (TruncatedTy == OriginalTy)
3340 continue;
3341
3342 IRBuilder<> B(cast<Instruction>(I));
3343 auto ShrinkOperand = [&](Value *V) -> Value * {
3344 if (auto *ZI = dyn_cast<ZExtInst>(V))
3345 if (ZI->getSrcTy() == TruncatedTy)
3346 return ZI->getOperand(0);
3347 return B.CreateZExtOrTrunc(V, TruncatedTy);
3348 };
3349
3350 // The actual instruction modification depends on the instruction type,
3351 // unfortunately.
3352 Value *NewI = nullptr;
3353 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3354 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3355 ShrinkOperand(BO->getOperand(1)));
3356
3357 // Any wrapping introduced by shrinking this operation shouldn't be
3358 // considered undefined behavior. So, we can't unconditionally copy
3359 // arithmetic wrapping flags to NewI.
3360 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3361 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3362 NewI =
3363 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3364 ShrinkOperand(CI->getOperand(1)));
3365 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3366 NewI = B.CreateSelect(SI->getCondition(),
3367 ShrinkOperand(SI->getTrueValue()),
3368 ShrinkOperand(SI->getFalseValue()));
3369 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3370 switch (CI->getOpcode()) {
3371 default:
3372 llvm_unreachable("Unhandled cast!");
3373 case Instruction::Trunc:
3374 NewI = ShrinkOperand(CI->getOperand(0));
3375 break;
3376 case Instruction::SExt:
3377 NewI = B.CreateSExtOrTrunc(
3378 CI->getOperand(0),
3379 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3380 break;
3381 case Instruction::ZExt:
3382 NewI = B.CreateZExtOrTrunc(
3383 CI->getOperand(0),
3384 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3385 break;
3386 }
3387 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3388 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3389 auto *O0 = B.CreateZExtOrTrunc(
3390 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3391 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3392 auto *O1 = B.CreateZExtOrTrunc(
3393 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3394
3395 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3396 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3397 // Don't do anything with the operands, just extend the result.
3398 continue;
3399 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3400 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3401 auto *O0 = B.CreateZExtOrTrunc(
3402 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3403 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3404 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3405 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3406 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3407 auto *O0 = B.CreateZExtOrTrunc(
3408 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3409 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3410 } else {
3411 // If we don't know what to do, be conservative and don't do anything.
3412 continue;
3413 }
3414
3415 // Lastly, extend the result.
3416 NewI->takeName(cast<Instruction>(I));
3417 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3418 I->replaceAllUsesWith(Res);
3419 cast<Instruction>(I)->eraseFromParent();
3420 Erased.insert(I);
3421 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3422 }
3423 }
3424
3425 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3426 for (const auto &KV : Cost->getMinimalBitwidths()) {
3427 // If the value wasn't vectorized, we must maintain the original scalar
3428 // type. The absence of the value from VectorLoopValueMap indicates that it
3429 // wasn't vectorized.
3430 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3431 continue;
3432 for (unsigned Part = 0; Part < UF; ++Part) {
3433 Value *I = getOrCreateVectorValue(KV.first, Part);
3434 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3435 if (Inst && Inst->use_empty()) {
3436 Value *NewI = Inst->getOperand(0);
3437 Inst->eraseFromParent();
3438 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3439 }
3440 }
3441 }
3442}
3443
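A tiny standalone scalar analogue of the transformation above (not LLVM code): perform the operation in the proven-sufficient narrow type and re-extend the result, leaving InstCombine to fold the redundant ext/trunc pairs.

#include <cstdint>

// If the cost model proves a 32-bit add only ever needs 8 bits, the vector
// equivalent of this is emitted: trunc the operands, add in i8, zext back.
uint32_t addInEightBits(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A);                 // trunc
  uint8_t NarrowB = static_cast<uint8_t>(B);                 // trunc
  uint8_t Narrow = static_cast<uint8_t>(NarrowA + NarrowB);  // op in narrow type
  return static_cast<uint32_t>(Narrow);                      // zext to original type
}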
3444void InnerLoopVectorizer::fixVectorizedLoop() {
3445 // Insert truncates and extends for any truncated instructions as hints to
3446 // InstCombine.
3447 if (VF > 1)
3448 truncateToMinimalBitwidths();
3449
3450 // Fix widened non-induction PHIs by setting up the PHI operands.
3451 if (OrigPHIsToFix.size()) {
3452 assert(EnableVPlanNativePath &&
3453 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3454 fixNonInductionPHIs();
3455 }
3456
3457 // At this point every instruction in the original loop is widened to a
3458 // vector form. Now we need to fix the recurrences in the loop. These PHI
3459 // nodes are currently empty because we did not want to introduce cycles.
3460 // This is the second stage of vectorizing recurrences.
3461 fixCrossIterationPHIs();
3462
3463 // Forget the original basic block.
3464 PSE.getSE()->forgetLoop(OrigLoop);
3465
3466 // Fix-up external users of the induction variables.
3467 for (auto &Entry : Legal->getInductionVars())
3468 fixupIVUsers(Entry.first, Entry.second,
3469 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3470 IVEndValues[Entry.first], LoopMiddleBlock);
3471
3472 fixLCSSAPHIs();
3473 for (Instruction *PI : PredicatedInstructions)
3474 sinkScalarOperands(&*PI);
3475
3476 // Remove redundant induction instructions.
3477 cse(LoopVectorBody);
3478
3479 // Set/update profile weights for the vector and remainder loops as original
3480 // loop iterations are now distributed among them. Note that original loop
3481 // represented by LoopScalarBody becomes remainder loop after vectorization.
3482 //
3483 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3484 // end up with a slightly less accurate result, but that should be OK since
3485 // the profile is not inherently precise anyway. Note also that any possible
3486 // bypass of vector code caused by legality checks is ignored, optimistically
3487 // assigning all the weight to the vector loop.
3488 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3489 LI->getLoopFor(LoopVectorBody),
3490 LI->getLoopFor(LoopScalarBody), VF * UF);
3491}
3492
3493void InnerLoopVectorizer::fixCrossIterationPHIs() {
3494 // In order to support recurrences we need to be able to vectorize Phi nodes.
3495 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3496 // stage #2: We now need to fix the recurrences by adding incoming edges to
3497 // the currently empty PHI nodes. At this point every instruction in the
3498 // original loop is widened to a vector form so we can use them to construct
3499 // the incoming edges.
3500 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3501 // Handle first-order recurrences and reductions that need to be fixed.
3502 if (Legal->isFirstOrderRecurrence(&Phi))
3503 fixFirstOrderRecurrence(&Phi);
3504 else if (Legal->isReductionVariable(&Phi))
3505 fixReduction(&Phi);
3506 }
3507}
3508
3509void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3510 // This is the second phase of vectorizing first-order recurrences. An
3511 // overview of the transformation is described below. Suppose we have the
3512 // following loop.
3513 //
3514 // for (int i = 0; i < n; ++i)
3515 // b[i] = a[i] - a[i - 1];
3516 //
3517 // There is a first-order recurrence on "a". For this loop, the shorthand
3518 // scalar IR looks like:
3519 //
3520 // scalar.ph:
3521 // s_init = a[-1]
3522 // br scalar.body
3523 //
3524 // scalar.body:
3525 // i = phi [0, scalar.ph], [i+1, scalar.body]
3526 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3527 // s2 = a[i]
3528 // b[i] = s2 - s1
3529 // br cond, scalar.body, ...
3530 //
3531 // In this example, s1 is a recurrence because its value depends on the
3532 // previous iteration. In the first phase of vectorization, we created a
3533 // temporary value for s1. We now complete the vectorization and produce the
3534 // shorthand vector IR shown below (for VF = 4, UF = 1).
3535 //
3536 // vector.ph:
3537 // v_init = vector(..., ..., ..., a[-1])
3538 // br vector.body
3539 //
3540 // vector.body
3541 // i = phi [0, vector.ph], [i+4, vector.body]
3542 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3543 // v2 = a[i, i+1, i+2, i+3];
3544 // v3 = vector(v1(3), v2(0, 1, 2))
3545 // b[i, i+1, i+2, i+3] = v2 - v3
3546 // br cond, vector.body, middle.block
3547 //
3548 // middle.block:
3549 // x = v2(3)
3550 // br scalar.ph
3551 //
3552 // scalar.ph:
3553 // s_init = phi [x, middle.block], [a[-1], otherwise]
3554 // br scalar.body
3555 //
3556 // After the vector loop finishes executing, we extract the next value of
3557 // the recurrence (x) to use as the initial value in the scalar loop.
3558
3559 // Get the original loop preheader and single loop latch.
3560 auto *Preheader = OrigLoop->getLoopPreheader();
3561 auto *Latch = OrigLoop->getLoopLatch();
3562
3563 // Get the initial and previous values of the scalar recurrence.
3564 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3565 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3566
3567 // Create a vector from the initial value.
3568 auto *VectorInit = ScalarInit;
3569 if (VF > 1) {
3570 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3571 VectorInit = Builder.CreateInsertElement(
3572 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3573 Builder.getInt32(VF - 1), "vector.recur.init");
3574 }
3575
3576 // We constructed a temporary phi node in the first phase of vectorization.
3577 // This phi node will eventually be deleted.
3578 Builder.SetInsertPoint(
3579 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3580
3581 // Create a phi node for the new recurrence. The current value will either be
3582 // the initial value inserted into a vector or loop-varying vector value.
3583 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3584 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3585
3586 // Get the vectorized previous value of the last part UF - 1. It appears last
3587 // among all unrolled iterations, due to the order of their construction.
3588 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3589
3590 // Find and set the insertion point after the previous value if it is an
3591 // instruction.
3592 BasicBlock::iterator InsertPt;
3593 // Note that the previous value may have been constant-folded so it is not
3594 // guaranteed to be an instruction in the vector loop.
3595 // FIXME: Loop invariant values do not form recurrences. We should deal with
3596 // them earlier.
3597 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3598 InsertPt = LoopVectorBody->getFirstInsertionPt();
3599 else {
3600 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3601 if (isa<PHINode>(PreviousLastPart))
3602 // If the previous value is a phi node, we should insert after all the phi
3603 // nodes in the block containing the PHI to avoid breaking basic block
3604 // verification. Note that the basic block may be different to
3605 // LoopVectorBody, in case we predicate the loop.
3606 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3607 else
3608 InsertPt = ++PreviousInst->getIterator();
3609 }
3610 Builder.SetInsertPoint(&*InsertPt);
3611
3612 // We will construct a vector for the recurrence by combining the values for
3613 // the current and previous iterations. This is the required shuffle mask.
3614 SmallVector<Constant *, 8> ShuffleMask(VF);
3615 ShuffleMask[0] = Builder.getInt32(VF - 1);
3616 for (unsigned I = 1; I < VF; ++I)
3617 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3618
3619 // The vector from which to take the initial value for the current iteration
3620 // (actual or unrolled). Initially, this is the vector phi node.
3621 Value *Incoming = VecPhi;
3622
3623 // Shuffle the current and previous vector and update the vector parts.
3624 for (unsigned Part = 0; Part < UF; ++Part) {
3625 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3626 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3627 auto *Shuffle =
3628 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3629 ConstantVector::get(ShuffleMask))
3630 : Incoming;
3631 PhiPart->replaceAllUsesWith(Shuffle);
3632 cast<Instruction>(PhiPart)->eraseFromParent();
3633 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3634 Incoming = PreviousPart;
3635 }
3636
3637 // Fix the latch value of the new recurrence in the vector loop.
3638 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3639
3640 // Extract the last vector element in the middle block. This will be the
3641 // initial value for the recurrence when jumping to the scalar loop.
3642 auto *ExtractForScalar = Incoming;
3643 if (VF > 1) {
3644 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3645 ExtractForScalar = Builder.CreateExtractElement(
3646 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3647 }
3648 // Extract the second-to-last element in the middle block if the
3649 // Phi is used outside the loop. We need to extract the phi itself
3650 // and not the last element (the phi update in the current iteration). This
3651 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3652 // when the scalar loop is not run at all.
3653 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3654 if (VF > 1)
3655 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3656 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3657 // When the loop is unrolled without vectorizing, initialize
3658 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3659 // value of `Incoming`. This is analogous to the vectorized case above:
3660 // extracting the second-to-last element when VF > 1.
3661 else if (UF > 1)
3662 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3663
3664 // Fix the initial value of the original recurrence in the scalar loop.
3665 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3666 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3667 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3668 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3669 Start->addIncoming(Incoming, BB);
3670 }
3671
3672 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3673 Phi->setName("scalar.recur");
3674
3675 // Finally, fix users of the recurrence outside the loop. The users will need
3676 // either the last value of the scalar recurrence or the last value of the
3677 // vector recurrence we extracted in the middle block. Since the loop is in
3678 // LCSSA form, we just need to find all the phi nodes for the original scalar
3679 // recurrence in the exit block, and then add an edge for the middle block.
3680 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3682 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3683 }
3684 }
3685}
3686
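The shuffle mask built above splices the last lane of the previous recurrence vector in front of lanes 0..VF-2 of the current part; a small standalone sketch of the mask construction (for VF = 4 it yields {3, 4, 5, 6}):

#include <vector>

// Lane indices into the concatenation <Incoming, PreviousPart>: element 0
// takes lane VF-1 of Incoming, elements 1..VF-1 take lanes 0..VF-2 of
// PreviousPart (which appear at indices VF..2*VF-2 of the concatenation).
std::vector<unsigned> recurrenceShuffleMask(unsigned VF) {
  std::vector<unsigned> Mask(VF);
  Mask[0] = VF - 1;
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = I + VF - 1;
  return Mask;
}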
3687void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3688 Constant *Zero = Builder.getInt32(0);
3689
3690 // Get its reduction variable descriptor.
3691 assert(Legal->isReductionVariable(Phi) &&
3692 "Unable to find the reduction variable");
3693 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3694
3695 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3696 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3697 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3698 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3699 RdxDesc.getMinMaxRecurrenceKind();
3700 setDebugLocFromInst(Builder, ReductionStartValue);
3701
3702 // We need to generate a reduction vector from the incoming scalar.
3703 // To do so, we need to generate the 'identity' vector and override
3704 // one of the elements with the incoming scalar reduction. We need
3705 // to do it in the vector-loop preheader.
3706 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3707
3708 // This is the vector-clone of the value that leaves the loop.
3709 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3710
3711 // Find the reduction identity variable. Zero for addition, or, xor,
3712 // one for multiplication, -1 for And.
3713 Value *Identity;
3714 Value *VectorStart;
3715 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3716 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3717 // MinMax reductions have the start value as their identity.
3718 if (VF == 1) {
3719 VectorStart = Identity = ReductionStartValue;
3720 } else {
3721 VectorStart = Identity =
3722 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3723 }
3724 } else {
3725 // Handle other reduction kinds:
3726 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3727 RK, VecTy->getScalarType());
3728 if (VF == 1) {
3729 Identity = Iden;
3730 // This vector is the Identity vector where the first element is the
3731 // incoming scalar reduction.
3732 VectorStart = ReductionStartValue;
3733 } else {
3734 Identity = ConstantVector::getSplat(VF, Iden);
3735
3736 // This vector is the Identity vector where the first element is the
3737 // incoming scalar reduction.
3738 VectorStart =
3739 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3740 }
3741 }
3742
3743 // Wrap flags are in general invalid after vectorization, clear them.
3744 clearReductionWrapFlags(RdxDesc);
3745
3746 // Fix the vector-loop phi.
3747
3748 // Reductions do not have to start at zero. They can start with
3749 // any loop invariant values.
3750 BasicBlock *Latch = OrigLoop->getLoopLatch();
3751 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3752
3753 for (unsigned Part = 0; Part < UF; ++Part) {
3754 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3755 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3756 // Make sure to add the reduction start value only to the
3757 // first unroll part.
3758 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3759 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3760 cast<PHINode>(VecRdxPhi)
3761 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3762 }
3763
3764 // Before each round, move the insertion point right between
3765 // the PHIs and the values we are going to write.
3766 // This allows us to write both PHINodes and the extractelement
3767 // instructions.
3768 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3769
3770 setDebugLocFromInst(Builder, LoopExitInst);
3771
3772 // If tail is folded by masking, the vector value to leave the loop should be
3773 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3774 // instead of the former.
3775 if (Cost->foldTailByMasking()) {
3776 for (unsigned Part = 0; Part < UF; ++Part) {
3777 Value *VecLoopExitInst =
3778 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3779 Value *Sel = nullptr;
3780 for (User *U : VecLoopExitInst->users()) {
3781 if (isa<SelectInst>(U)) {
3782 assert(!Sel && "Reduction exit feeding two selects");
3783 Sel = U;
3784 } else
3785 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3786 }
3787 assert(Sel && "Reduction exit feeds no select");
3788 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3789 }
3790 }
3791
3792 // If the vector reduction can be performed in a smaller type, we truncate
3793 // then extend the loop exit value to enable InstCombine to evaluate the
3794 // entire expression in the smaller type.
3795 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3796 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3797 Builder.SetInsertPoint(
3798 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3799 VectorParts RdxParts(UF);
3800 for (unsigned Part = 0; Part < UF; ++Part) {
3801 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3802 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3803 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3804 : Builder.CreateZExt(Trunc, VecTy);
3805 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3806 UI != RdxParts[Part]->user_end();)
3807 if (*UI != Trunc) {
3808 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3809 RdxParts[Part] = Extnd;
3810 } else {
3811 ++UI;
3812 }
3813 }
3814 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3815 for (unsigned Part = 0; Part < UF; ++Part) {
3816 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3817 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3818 }
3819 }
3820
3821 // Reduce all of the unrolled parts into a single vector.
3822 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3823 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3824
3825 // The middle block terminator has already been assigned a DebugLoc here (the
3826 // OrigLoop's single latch terminator). We want the whole middle block to
3827 // appear to execute on this line because: (a) it is all compiler generated,
3828 // (b) these instructions are always executed after evaluating the latch
3829 // conditional branch, and (c) other passes may add new predecessors which
3830 // terminate on this line. This is the easiest way to ensure we don't
3831 // accidentally cause an extra step back into the loop while debugging.
3832 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3833 for (unsigned Part = 1; Part < UF; ++Part) {
3834 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3835 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3836 // Floating point operations had to be 'fast' to enable the reduction.
3837 ReducedPartRdx = addFastMathFlag(
3838 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3839 ReducedPartRdx, "bin.rdx"),
3840 RdxDesc.getFastMathFlags());
3841 else
3842 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3843 RdxPart);
3844 }
3845
3846 if (VF > 1) {
3847 bool NoNaN = Legal->hasFunNoNaNAttr();
3848 ReducedPartRdx =
3849 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3850 // If the reduction can be performed in a smaller type, we need to extend
3851 // the reduction to the wider type before we branch to the original loop.
3852 if (Phi->getType() != RdxDesc.getRecurrenceType())
3853 ReducedPartRdx =
3854 RdxDesc.isSigned()
3855 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3856 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3857 }
3858
3859 // Create a phi node that merges control-flow from the backedge-taken check
3860 // block and the middle block.
3861 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3862 LoopScalarPreHeader->getTerminator());
3863 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3864 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3865 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3866
3867 // Now, we need to fix the users of the reduction variable
3868 // inside and outside of the scalar remainder loop.
3869 // We know that the loop is in LCSSA form. We need to update the
3870 // PHI nodes in the exit blocks.
3871 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3872 // All PHINodes need to have a single entry edge, or two if
3873 // we already fixed them.
3874 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3875
3876 // We found a reduction value exit-PHI. Update it with the
3877 // incoming bypass edge.
3878 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3879 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3880 } // end of the LCSSA phi scan.
3881
3882 // Fix the scalar loop reduction variable with the incoming reduction sum
3883 // from the vector body and from the backedge value.
3884 int IncomingEdgeBlockIdx =
3885 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3886 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3887 // Pick the other block.
3888 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3889 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3890 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3891}
3892
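A minimal standalone sketch (illustrative only, for an integer add reduction) of the final two steps above: the UF unrolled partial vectors are folded together with the reduction binop, then the remaining vector is reduced horizontally.

#include <cstdint>
#include <vector>

// Parts[p][l] models lane l of unrolled part p of the vectorized reduction.
// Assumes at least one part and equal-length parts.
int64_t reduceAddParts(const std::vector<std::vector<int64_t>> &Parts) {
  // Step 1: fold all unrolled parts into one vector (the "bin.rdx" chain).
  std::vector<int64_t> Rdx = Parts[0];
  for (size_t Part = 1; Part < Parts.size(); ++Part)
    for (size_t Lane = 0; Lane < Rdx.size(); ++Lane)
      Rdx[Lane] += Parts[Part][Lane];
  // Step 2: horizontal reduction of the remaining vector (what
  // createTargetReduction lowers to a target reduction intrinsic).
  int64_t Result = 0; // identity for integer add
  for (int64_t Lane : Rdx)
    Result += Lane;
  return Result;
}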
3893void InnerLoopVectorizer::clearReductionWrapFlags(
3894 RecurrenceDescriptor &RdxDesc) {
3895 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3896 if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3897 RK != RecurrenceDescriptor::RK_IntegerMult)
3898 return;
3899
3900 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3901 assert(LoopExitInstr && "null loop exit instruction");
3902 SmallVector<Instruction *, 8> Worklist;
3903 SmallPtrSet<Instruction *, 8> Visited;
3904 Worklist.push_back(LoopExitInstr);
3905 Visited.insert(LoopExitInstr);
3906
3907 while (!Worklist.empty()) {
3908 Instruction *Cur = Worklist.pop_back_val();
3909 if (isa<OverflowingBinaryOperator>(Cur))
3910 for (unsigned Part = 0; Part < UF; ++Part) {
3911 Value *V = getOrCreateVectorValue(Cur, Part);
3912 cast<Instruction>(V)->dropPoisonGeneratingFlags();
3913 }
3914
3915 for (User *U : Cur->users()) {
3916 Instruction *UI = cast<Instruction>(U);
3917 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3918 Visited.insert(UI).second)
3919 Worklist.push_back(UI);
3920 }
3921 }
3922}
3923
3924void InnerLoopVectorizer::fixLCSSAPHIs() {
3925 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3926 if (LCSSAPhi.getNumIncomingValues() == 1) {
3927 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3928 // Non-instruction incoming values will have only one value.
3929 unsigned LastLane = 0;
3930 if (isa<Instruction>(IncomingValue))
3931 LastLane = Cost->isUniformAfterVectorization(
3932 cast<Instruction>(IncomingValue), VF)
3933 ? 0
3934 : VF - 1;
3935 // Can be a loop invariant incoming value or the last scalar value to be
3936 // extracted from the vectorized loop.
3937 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3938 Value *lastIncomingValue =
3939 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3940 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3941 }
3942 }
3943}
3944
3945void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3946 // The basic block and loop containing the predicated instruction.
3947 auto *PredBB = PredInst->getParent();
3948 auto *VectorLoop = LI->getLoopFor(PredBB);
3949
3950 // Initialize a worklist with the operands of the predicated instruction.
3951 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3952
3953 // Holds instructions that we need to analyze again. An instruction may be
3954 // reanalyzed if we don't yet know if we can sink it or not.
3955 SmallVector<Instruction *, 8> InstsToReanalyze;
3956
3957 // Returns true if a given use occurs in the predicated block. Phi nodes use
3958 // their operands in their corresponding predecessor blocks.
3959 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3960 auto *I = cast<Instruction>(U.getUser());
3961 BasicBlock *BB = I->getParent();
3962 if (auto *Phi = dyn_cast<PHINode>(I))
3963 BB = Phi->getIncomingBlock(
3964 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3965 return BB == PredBB;
3966 };
3967
3968 // Iteratively sink the scalarized operands of the predicated instruction
3969 // into the block we created for it. When an instruction is sunk, its
3970 // operands are then added to the worklist. The algorithm ends once a pass
3971 // through the worklist sinks no instructions.
3972 bool Changed;
3973 do {
3974 // Add the instructions that need to be reanalyzed to the worklist, and
3975 // reset the changed indicator.
3976 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3977 InstsToReanalyze.clear();
3978 Changed = false;
3979
3980 while (!Worklist.empty()) {
3981 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3982
3983 // We can't sink an instruction if it is a phi node, is already in the
3984 // predicated block, is not in the loop, or may have side effects.
3985 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3986 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3987 continue;
3988
3989 // It's legal to sink the instruction if all its uses occur in the
3990 // predicated block. Otherwise, there's nothing to do yet, and we may
3991 // need to reanalyze the instruction.
3992 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3993 InstsToReanalyze.push_back(I);
3994 continue;
3995 }
3996
3997 // Move the instruction to the beginning of the predicated block, and add
3999 // its operands to the worklist.
3999 I->moveBefore(&*PredBB->getFirstInsertionPt());
4000 Worklist.insert(I->op_begin(), I->op_end());
4001
4002 // The sinking may have enabled other instructions to be sunk, so we will
4003 // need to iterate.
4004 Changed = true;
4005 }
4006 } while (Changed);
4007}
4008
4009void InnerLoopVectorizer::fixNonInductionPHIs() {
4010 for (PHINode *OrigPhi : OrigPHIsToFix) {
4011 PHINode *NewPhi =
4012 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4013 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4014
4015 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4016 predecessors(OrigPhi->getParent()));
4017 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4018 predecessors(NewPhi->getParent()));
4019 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4020 "Scalar and Vector BB should have the same number of predecessors");
4021
4022 // The insertion point in Builder may be invalidated by the time we get
4023 // here. Force the Builder insertion point to something valid so that we do
4024 // not run into issues during insertion point restore in
4025 // getOrCreateVectorValue calls below.
4026 Builder.SetInsertPoint(NewPhi);
4027
4028 // The predecessor order is preserved and we can rely on mapping between
4029 // scalar and vector block predecessors.
4030 for (unsigned i = 0; i < NumIncomingValues; ++i) {
4031 BasicBlock *NewPredBB = VectorBBPredecessors[i];
4032
4033 // When looking up the new scalar/vector values to fix up, use incoming
4034 // values from original phi.
4035 Value *ScIncV =
4036 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4037
4038 // Scalar incoming value may need a broadcast
4039 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4040 NewPhi->addIncoming(NewIncV, NewPredBB);
4041 }
4042 }
4043}
4044
4045void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4046 unsigned VF, bool IsPtrLoopInvariant,
4047 SmallBitVector &IsIndexLoopInvariant) {
4048 // Construct a vector GEP by widening the operands of the scalar GEP as
4049 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4050 // results in a vector of pointers when at least one operand of the GEP
4051 // is vector-typed. Thus, to keep the representation compact, we only use
4052 // vector-typed operands for loop-varying values.
4053
4054 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4055 // If we are vectorizing, but the GEP has only loop-invariant operands,
4056 // the GEP we build (by only using vector-typed operands for
4057 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4058 // produce a vector of pointers, we need to either arbitrarily pick an
4059 // operand to broadcast, or broadcast a clone of the original GEP.
4060 // Here, we broadcast a clone of the original.
4061 //
4062 // TODO: If at some point we decide to scalarize instructions having
4063 // loop-invariant operands, this special case will no longer be
4064 // required. We would add the scalarization decision to
4065 // collectLoopScalars() and teach getVectorValue() to broadcast
4066 // the lane-zero scalar value.
4067 auto *Clone = Builder.Insert(GEP->clone());
4068 for (unsigned Part = 0; Part < UF; ++Part) {
4069 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4070 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4071 addMetadata(EntryPart, GEP);
4072 }
4073 } else {
4074 // If the GEP has at least one loop-varying operand, we are sure to
4075 // produce a vector of pointers. But if we are only unrolling, we want
4076 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4077 // produce with the code below will be scalar (if VF == 1) or vector
4078 // (otherwise). Note that for the unroll-only case, we still maintain
4079 // values in the vector mapping with initVector, as we do for other
4080 // instructions.
4081 for (unsigned Part = 0; Part < UF; ++Part) {
4082 // The pointer operand of the new GEP. If it's loop-invariant, we
4083 // won't broadcast it.
4084 auto *Ptr = IsPtrLoopInvariant
4085 ? GEP->getPointerOperand()
4086 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4087
4088 // Collect all the indices for the new GEP. If any index is
4089 // loop-invariant, we won't broadcast it.
4090 SmallVector<Value *, 4> Indices;
4091 for (auto Index : enumerate(GEP->indices())) {
4092 Value *User = Index.value().get();
4093 if (IsIndexLoopInvariant[Index.index()])
4094 Indices.push_back(User);
4095 else
4096 Indices.push_back(getOrCreateVectorValue(User, Part));
4097 }
4098
4099 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4100 // but it should be a vector, otherwise.
4101 auto *NewGEP =
4102 GEP->isInBounds()
4103 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4104 Indices)
4105 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4106 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4107 "NewGEP is not a pointer vector");
4108 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4109 addMetadata(NewGEP, GEP);
4110 }
4111 }
4112}
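// To make the two cases above concrete, here is a minimal sketch (a
// hypothetical loop, not taken from this report): with VF = 4 and a
// loop-varying index %iv, a scalar access such as
//   %gep = getelementptr inbounds float, float* %base, i64 %iv
// keeps the invariant %base as a scalar operand and widens only the index to
// <4 x i64>, so the resulting GEP produces <4 x float*>. If %base and every
// index were loop-invariant, the scalar GEP would instead be cloned and
// broadcast with CreateVectorSplat, as in the first branch above.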
4113
4114void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4115 unsigned VF) {
4116 PHINode *P = cast<PHINode>(PN);
4117 if (EnableVPlanNativePath) {
4118 // Currently we enter here in the VPlan-native path for non-induction
4119 // PHIs where all control flow is uniform. We simply widen these PHIs.
4120 // Create a vector phi with no operands - the vector phi operands will be
4121 // set at the end of vector code generation.
4122 Type *VecTy =
4123 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4124 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4125 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4126 OrigPHIsToFix.push_back(P);
4127
4128 return;
4129 }
4130
4131 assert(PN->getParent() == OrigLoop->getHeader() &&
4132 "Non-header phis should have been handled elsewhere");
4133
4134 // In order to support recurrences we need to be able to vectorize Phi nodes.
4135 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4136 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4137 // this value when we vectorize all of the instructions that use the PHI.
4138 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4139 for (unsigned Part = 0; Part < UF; ++Part) {
4140 // This is phase one of vectorizing PHIs.
4141 Type *VecTy =
4142 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4143 Value *EntryPart = PHINode::Create(
4144 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4145 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4146 }
4147 return;
4148 }
4149
4150 setDebugLocFromInst(Builder, P);
4151
4152 // This PHINode must be an induction variable.
4153 // Make sure that we know about it.
4154 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4155
4156 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4157 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4158
4159 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4160 // which can be found from the original scalar operations.
4161 switch (II.getKind()) {
4162 case InductionDescriptor::IK_NoInduction:
4163 llvm_unreachable("Unknown induction");
4164 case InductionDescriptor::IK_IntInduction:
4165 case InductionDescriptor::IK_FpInduction:
4166 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4167 case InductionDescriptor::IK_PtrInduction: {
4168 // Handle the pointer induction variable case.
4169 assert(P->getType()->isPointerTy() && "Unexpected type.");
4170 // This is the normalized GEP that starts counting at zero.
4171 Value *PtrInd = Induction;
4172 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4173 // Determine the number of scalars we need to generate for each unroll
4174 // iteration. If the instruction is uniform, we only need to generate the
4175 // first lane. Otherwise, we generate all VF values.
4176 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4177 // These are the scalar results. Notice that we don't generate vector GEPs
4178 // because scalar GEPs result in better code.
4179 for (unsigned Part = 0; Part < UF; ++Part) {
4180 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4181 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4182 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4183 Value *SclrGep =
4184 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4185 SclrGep->setName("next.gep");
4186 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4187 }
4188 }
4189 return;
4190 }
4191 }
4192}
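// A rough worked example for the IK_PtrInduction case above (assuming VF = 4,
// UF = 2, and a pointer phi that is not uniform-after-vectorization): eight
// scalar "next.gep" addresses are emitted, one per (Part, Lane) pair, each
// derived from the normalized index PtrInd + (Lane + Part * VF). When the phi
// is uniform-after-vectorization, only lane 0 of each part is materialized.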
4193
4194/// A helper function for checking whether an integer division-related
4195/// instruction may divide by zero (in which case it must be predicated if
4196/// executed conditionally in the scalar code).
4197/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4198/// Non-zero divisors that are non compile-time constants will not be
4199/// converted into multiplication, so we will still end up scalarizing
4200/// the division, but can do so w/o predication.
4201static bool mayDivideByZero(Instruction &I) {
4202 assert((I.getOpcode() == Instruction::UDiv ||
4203 I.getOpcode() == Instruction::SDiv ||
4204 I.getOpcode() == Instruction::URem ||
4205 I.getOpcode() == Instruction::SRem) &&
4206 "Unexpected instruction");
4207 Value *Divisor = I.getOperand(1);
4208 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4209 return !CInt || CInt->isZero();
4210}
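// Sketch of what the predicate above returns (hypothetical operands): for
// 'udiv %a, 7' it is false, since a non-zero constant divisor cannot trap;
// for 'udiv %a, %b' and 'udiv %a, 0' it is true, so such divisions must be
// predicated if they execute conditionally in the scalar loop.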
4211
4212void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4213 switch (I.getOpcode()) {
4214 case Instruction::Br:
4215 case Instruction::PHI:
4216 case Instruction::GetElementPtr:
4217 llvm_unreachable("This instruction is handled by a different recipe.");
4218 case Instruction::UDiv:
4219 case Instruction::SDiv:
4220 case Instruction::SRem:
4221 case Instruction::URem:
4222 case Instruction::Add:
4223 case Instruction::FAdd:
4224 case Instruction::Sub:
4225 case Instruction::FSub:
4226 case Instruction::FNeg:
4227 case Instruction::Mul:
4228 case Instruction::FMul:
4229 case Instruction::FDiv:
4230 case Instruction::FRem:
4231 case Instruction::Shl:
4232 case Instruction::LShr:
4233 case Instruction::AShr:
4234 case Instruction::And:
4235 case Instruction::Or:
4236 case Instruction::Xor: {
4237 // Just widen unops and binops.
4238 setDebugLocFromInst(Builder, &I);
4239
4240 for (unsigned Part = 0; Part < UF; ++Part) {
4241 SmallVector<Value *, 2> Ops;
4242 for (Value *Op : I.operands())
4243 Ops.push_back(getOrCreateVectorValue(Op, Part));
4244
4245 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4246
4247 if (auto *VecOp = dyn_cast<Instruction>(V))
4248 VecOp->copyIRFlags(&I);
4249
4250 // Use this vector value for all users of the original instruction.
4251 VectorLoopValueMap.setVectorValue(&I, Part, V);
4252 addMetadata(V, &I);
4253 }
4254
4255 break;
4256 }
4257 case Instruction::Select: {
4258 // Widen selects.
4259 // If the selector is loop invariant we can create a select
4260 // instruction with a scalar condition. Otherwise, use vector-select.
4261 auto *SE = PSE.getSE();
4262 bool InvariantCond =
4263 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4264 setDebugLocFromInst(Builder, &I);
4265
4266 // The condition can be loop invariant but still defined inside the
4267 // loop. This means that we can't just use the original 'cond' value.
4268 // We have to take the 'vectorized' value and pick the first lane.
4269 // Instcombine will make this a no-op.
4270
4271 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4272
4273 for (unsigned Part = 0; Part < UF; ++Part) {
4274 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4275 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4276 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4277 Value *Sel =
4278 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4279 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4280 addMetadata(Sel, &I);
4281 }
4282
4283 break;
4284 }
4285
4286 case Instruction::ICmp:
4287 case Instruction::FCmp: {
4288 // Widen compares. Generate vector compares.
4289 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4290 auto *Cmp = cast<CmpInst>(&I);
4291 setDebugLocFromInst(Builder, Cmp);
4292 for (unsigned Part = 0; Part < UF; ++Part) {
4293 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4294 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4295 Value *C = nullptr;
4296 if (FCmp) {
4297 // Propagate fast math flags.
4298 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4299 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4300 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4301 } else {
4302 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4303 }
4304 VectorLoopValueMap.setVectorValue(&I, Part, C);
4305 addMetadata(C, &I);
4306 }
4307
4308 break;
4309 }
4310
4311 case Instruction::ZExt:
4312 case Instruction::SExt:
4313 case Instruction::FPToUI:
4314 case Instruction::FPToSI:
4315 case Instruction::FPExt:
4316 case Instruction::PtrToInt:
4317 case Instruction::IntToPtr:
4318 case Instruction::SIToFP:
4319 case Instruction::UIToFP:
4320 case Instruction::Trunc:
4321 case Instruction::FPTrunc:
4322 case Instruction::BitCast: {
4323 auto *CI = cast<CastInst>(&I);
4324 setDebugLocFromInst(Builder, CI);
4325
4326 /// Vectorize casts.
4327 Type *DestTy =
4328 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4329
4330 for (unsigned Part = 0; Part < UF; ++Part) {
4331 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4332 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4333 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4334 addMetadata(Cast, &I);
4335 }
4336 break;
4337 }
4338
4339 case Instruction::Call: {
4340 // Ignore dbg intrinsics.
4341 if (isa<DbgInfoIntrinsic>(I))
4342 break;
4343 setDebugLocFromInst(Builder, &I);
4344
4345 Module *M = I.getParent()->getParent()->getParent();
4346 auto *CI = cast<CallInst>(&I);
4347
4348 SmallVector<Type *, 4> Tys;
4349 for (Value *ArgOperand : CI->arg_operands())
4350 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4351
4352 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4353
4354 // The flag shows whether we use an intrinsic or a regular call for the
4355 // vectorized version of the instruction.
4356 // Is it beneficial to perform intrinsic call compared to lib call?
4357 bool NeedToScalarize = false;
4358 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4359 bool UseVectorIntrinsic =
4360 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4361 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4362 "Instruction should be scalarized elsewhere.");
4363
4364 for (unsigned Part = 0; Part < UF; ++Part) {
4365 SmallVector<Value *, 4> Args;
4366 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4367 Value *Arg = CI->getArgOperand(i);
4368 // Some intrinsics have a scalar argument - don't replace it with a
4369 // vector.
4370 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4371 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4372 Args.push_back(Arg);
4373 }
4374
4375 Function *VectorF;
4376 if (UseVectorIntrinsic) {
4377 // Use vector version of the intrinsic.
4378 Type *TysForDecl[] = {CI->getType()};
4379 if (VF > 1)
4380 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4381 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4382 } else {
4383 // Use vector version of the function call.
4384 const VFShape Shape =
4385 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4386#ifndef NDEBUG
4387 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4388 assert(std::find_if(Infos.begin(), Infos.end(),
4389 [&Shape](const VFInfo &Info) {
4390 return Info.Shape == Shape;
4391 }) != Infos.end() &&
4392 "Vector function shape is missing from the database.");
4393#endif
4394 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4395 }
4396 assert(VectorF && "Can't create vector function.");
4397
4398 SmallVector<OperandBundleDef, 1> OpBundles;
4399 CI->getOperandBundlesAsDefs(OpBundles);
4400 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4401
4402 if (isa<FPMathOperator>(V))
4403 V->copyFastMathFlags(CI);
4404
4405 VectorLoopValueMap.setVectorValue(&I, Part, V);
4406 addMetadata(V, &I);
4407 }
4408
4409 break;
4410 }
4411
4412 default:
4413 // This instruction is not vectorized by simple widening.
4414 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4415 llvm_unreachable("Unhandled instruction!");
4416 } // end of switch.
4417}
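// A brief illustration of the binary-operator path above (hypothetical input,
// not from this report): with UF = 2 and VF = 4, a scalar 'add nsw i32' is
// widened into two <4 x i32> adds, one per unroll part, each built from the
// vectorized operands of that part; copyIRFlags preserves the nsw flag and
// addMetadata carries over the applicable metadata.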
4418
4419void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4420 // We should not collect Scalars more than once per VF. Right now, this
4421 // function is called from collectUniformsAndScalars(), which already does
4422 // this check. Collecting Scalars for VF=1 does not make any sense.
4423 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4424 "This function should not be visited twice for the same VF");
4425
4426 SmallSetVector<Instruction *, 8> Worklist;
4427
4428 // These sets are used to seed the analysis with pointers used by memory
4429 // accesses that will remain scalar.
4430 SmallSetVector<Instruction *, 8> ScalarPtrs;
4431 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4432
4433 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4434 // The pointer operands of loads and stores will be scalar as long as the
4435 // memory access is not a gather or scatter operation. The value operand of a
4436 // store will remain scalar if the store is scalarized.
4437 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4438 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4439 assert(WideningDecision != CM_Unknown &&
4440 "Widening decision should be ready at this moment");
4441 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4442 if (Ptr == Store->getValueOperand())
4443 return WideningDecision == CM_Scalarize;
4444 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4445 "Ptr is neither a value or pointer operand");
4446 return WideningDecision != CM_GatherScatter;
4447 };
4448
4449 // A helper that returns true if the given value is a bitcast or
4450 // getelementptr instruction contained in the loop.
4451 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4452 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4453 isa<GetElementPtrInst>(V)) &&
4454 !TheLoop->isLoopInvariant(V);
4455 };
4456
4457 // A helper that evaluates a memory access's use of a pointer. If the use
4458 // will be a scalar use, and the pointer is only used by memory accesses, we
4459 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4460 // PossibleNonScalarPtrs.
4461 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4462 // We only care about bitcast and getelementptr instructions contained in
4463 // the loop.
4464 if (!isLoopVaryingBitCastOrGEP(Ptr))
4465 return;
4466
4467 // If the pointer has already been identified as scalar (e.g., if it was
4468 // also identified as uniform), there's nothing to do.
4469 auto *I = cast<Instruction>(Ptr);
4470 if (Worklist.count(I))
4471 return;
4472
4473 // If the use of the pointer will be a scalar use, and all users of the
4474 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4475 // place the pointer in PossibleNonScalarPtrs.
4476 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4477 return isa<LoadInst>(U) || isa<StoreInst>(U);
4478 }))
4479 ScalarPtrs.insert(I);
4480 else
4481 PossibleNonScalarPtrs.insert(I);
4482 };
4483
4484 // We seed the scalars analysis with three classes of instructions: (1)
4485 // instructions marked uniform-after-vectorization, (2) bitcast and
4486 // getelementptr instructions used by memory accesses requiring a scalar use,
4487 // and (3) pointer induction variables and their update instructions (we
4488 // currently only scalarize these).
4489 //
4490 // (1) Add to the worklist all instructions that have been identified as
4491 // uniform-after-vectorization.
4492 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4493
4494 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4495 // memory accesses requiring a scalar use. The pointer operands of loads and
4496 // stores will be scalar as long as the memory access is not a gather or
4497 // scatter operation. The value operand of a store will remain scalar if the
4498 // store is scalarized.
4499 for (auto *BB : TheLoop->blocks())
4500 for (auto &I : *BB) {
4501 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4502 evaluatePtrUse(Load, Load->getPointerOperand());
4503 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4504 evaluatePtrUse(Store, Store->getPointerOperand());
4505 evaluatePtrUse(Store, Store->getValueOperand());
4506 }
4507 }
4508 for (auto *I : ScalarPtrs)
4509 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4510 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4511 Worklist.insert(I);
4512 }
4513
4514 // (3) Add to the worklist all pointer induction variables and their update
4515 // instructions.
4516 //
4517 // TODO: Once we are able to vectorize pointer induction variables we should
4518 // no longer insert them into the worklist here.
4519 auto *Latch = TheLoop->getLoopLatch();
4520 for (auto &Induction : Legal->getInductionVars()) {
4521 auto *Ind = Induction.first;
4522 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4523 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4524 continue;
4525 Worklist.insert(Ind);
4526 Worklist.insert(IndUpdate);
4527 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4528 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4529 << "\n");
4530 }
4531
4532 // Insert the forced scalars.
4533 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4534 // induction variable when the PHI user is scalarized.
4535 auto ForcedScalar = ForcedScalars.find(VF);
4536 if (ForcedScalar != ForcedScalars.end())
4537 for (auto *I : ForcedScalar->second)
4538 Worklist.insert(I);
4539
4540 // Expand the worklist by looking through any bitcasts and getelementptr
4541 // instructions we've already identified as scalar. This is similar to the
4542 // expansion step in collectLoopUniforms(); however, here we're only
4543 // expanding to include additional bitcasts and getelementptr instructions.
4544 unsigned Idx = 0;
4545 while (Idx != Worklist.size()) {
4546 Instruction *Dst = Worklist[Idx++];
4547 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4548 continue;
4549 auto *Src = cast<Instruction>(Dst->getOperand(0));
4550 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4551 auto *J = cast<Instruction>(U);
4552 return !TheLoop->contains(J) || Worklist.count(J) ||
4553 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4554 isScalarUse(J, Src));
4555 })) {
4556 Worklist.insert(Src);
4557 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4558 }
4559 }
4560
4561 // An induction variable will remain scalar if all users of the induction
4562 // variable and induction variable update remain scalar.
4563 for (auto &Induction : Legal->getInductionVars()) {
4564 auto *Ind = Induction.first;
4565 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4566
4567 // We already considered pointer induction variables, so there's no reason
4568 // to look at their users again.
4569 //
4570 // TODO: Once we are able to vectorize pointer induction variables we
4571 // should no longer skip over them here.
4572 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4573 continue;
4574
4575 // Determine if all users of the induction variable are scalar after
4576 // vectorization.
4577 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4578 auto *I = cast<Instruction>(U);
4579 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4580 });
4581 if (!ScalarInd)
4582 continue;
4583
4584 // Determine if all users of the induction variable update instruction are
4585 // scalar after vectorization.
4586 auto ScalarIndUpdate =
4587 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4588 auto *I = cast<Instruction>(U);
4589 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4590 });
4591 if (!ScalarIndUpdate)
4592 continue;
4593
4594 // The induction variable and its update instruction will remain scalar.
4595 Worklist.insert(Ind);
4596 Worklist.insert(IndUpdate);
4597 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4598 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4599 << "\n");
4600 }
4601
4602 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4603}
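// For illustration (a hypothetical loop): if a getelementptr is used only as
// the address of a non-gather load and of a store that was decided
// CM_Scalarize, it lands in ScalarPtrs and then in Scalars[VF]; if some other
// arithmetic also consumes that GEP, it is recorded in PossibleNonScalarPtrs
// and is not marked scalar here.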
4604
4605bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4606 if (!blockNeedsPredication(I->getParent()))
4607 return false;
4608 switch(I->getOpcode()) {
4609 default:
4610 break;
4611 case Instruction::Load:
4612 case Instruction::Store: {
4613 if (!Legal->isMaskRequired(I))
4614 return false;
4615 auto *Ptr = getLoadStorePointerOperand(I);
4616 auto *Ty = getMemInstValueType(I);
4617 // We have already decided how to vectorize this instruction, get that
4618 // result.
4619 if (VF > 1) {
4620 InstWidening WideningDecision = getWideningDecision(I, VF);
4621 assert(WideningDecision != CM_Unknown &&
4622 "Widening decision should be ready at this moment");
4623 return WideningDecision == CM_Scalarize;
4624 }
4625 const MaybeAlign Alignment = getLoadStoreAlignment(I);
4626 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4627 isLegalMaskedGather(Ty, Alignment))
4628 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4629 isLegalMaskedScatter(Ty, Alignment));
4630 }
4631 case Instruction::UDiv:
4632 case Instruction::SDiv:
4633 case Instruction::SRem:
4634 case Instruction::URem:
4635 return mayDivideByZero(*I);
4636 }
4637 return false;
4638}
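// For illustration (hypothetical source): a conditional store such as
//   if (cond[i]) A[i] = x;
// needs a mask, and it becomes scalar-with-predication when it cannot be kept
// as a masked store or scatter (or when the cost model already chose
// CM_Scalarize for it); likewise udiv/sdiv/urem/srem with a non-constant or
// zero divisor is predicated because it may trap.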
4639
4640bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4641 unsigned VF) {
4642 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4643 assert(getWideningDecision(I, VF) == CM_Unknown &&
4644 "Decision should not be set yet.");
4645 auto *Group = getInterleavedAccessGroup(I);
4646 assert(Group && "Must have a group.");
4647
4648 // If the instruction's allocated size doesn't equal its type size, it
4649 // requires padding and will be scalarized.
4650 auto &DL = I->getModule()->getDataLayout();
4651 auto *ScalarTy = getMemInstValueType(I);
4652 if (hasIrregularType(ScalarTy, DL, VF))
4653 return false;
4654
4655 // Check if masking is required.
4656 // A Group may need masking for one of two reasons: it resides in a block that
4657 // needs predication, or it was decided to use masking to deal with gaps.
4658 bool PredicatedAccessRequiresMasking =
4659 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4660 bool AccessWithGapsRequiresMasking =
4661 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4662 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4663 return true;
4664
4665 // If masked interleaving is required, we expect that the user/target had
4666 // enabled it, because otherwise it either wouldn't have been created or
4667 // it should have been invalidated by the CostModel.
4668 assert(useMaskedInterleavedAccesses(TTI) &&
4669 "Masked interleave-groups for predicated accesses are not enabled.");
4670
4671 auto *Ty = getMemInstValueType(I);
4672 const MaybeAlign Alignment = getLoadStoreAlignment(I);
4673 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4674 : TTI.isLegalMaskedStore(Ty, Alignment);
4675}
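// A sketch of the masking condition above (hypothetical access pattern): an
// interleave group with a gap, e.g. a loop that reads A[2*i] but never
// A[2*i+1], normally needs a scalar epilogue; when that epilogue is not
// allowed, the group can only be widened if the target supports the
// corresponding masked load or store for the member type.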
4676
4677bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4678 unsigned VF) {
4679 // Get and ensure we have a valid memory instruction.
4680 LoadInst *LI = dyn_cast<LoadInst>(I);
4681 StoreInst *SI = dyn_cast<StoreInst>(I);
4682 assert((LI || SI) && "Invalid memory instruction");
4683
4684 auto *Ptr = getLoadStorePointerOperand(I);
4685
4686 // In order to be widened, the pointer should be consecutive, first of all.
4687 if (!Legal->isConsecutivePtr(Ptr))
4688 return false;
4689
4690 // If the instruction is a store located in a predicated block, it will be
4691 // scalarized.
4692 if (isScalarWithPredication(I))
4693 return false;
4694
4695 // If the instruction's allocated size doesn't equal its type size, it
4696 // requires padding and will be scalarized.
4697 auto &DL = I->getModule()->getDataLayout();
4698 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4699 if (hasIrregularType(ScalarTy, DL, VF))
4700 return false;
4701
4702 return true;
4703}
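// For illustration (hypothetical accesses): a unit-stride load of A[i] passes
// all three checks above, a load of A[3*i] fails the consecutive-pointer
// check, and a consecutive store inside a predicated block is rejected
// because it will be scalarized rather than widened.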
4704
4705void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4706 // We should not collect Uniforms more than once per VF. Right now,
4707 // this function is called from collectUniformsAndScalars(), which
4708 // already does this check. Collecting Uniforms for VF=1 does not make any
4709 // sense.
4710
4711 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4712 "This function should not be visited twice for the same VF");
4713
4714 // Visit the list of Uniforms. If we do not find any uniform value, we will
4715 // not analyze again. Uniforms.count(VF) will return 1.
4716 Uniforms[VF].clear();
4717
4718 // We now know that the loop is vectorizable!
4719 // Collect instructions inside the loop that will remain uniform after
4720 // vectorization.
4721
4722 // Global values, params and instructions outside of current loop are out of
4723 // scope.
4724 auto isOutOfScope = [&](Value *V) -> bool {
4725 Instruction *I = dyn_cast<Instruction>(V);
4726 return (!I || !TheLoop->contains(I));
4727 };
4728
4729 SetVector<Instruction *> Worklist;
4730 BasicBlock *Latch = TheLoop->getLoopLatch();
4731
4732 // Instructions that are scalar with predication must not be considered
4733 // uniform after vectorization, because that would create an erroneous
4734 // replicating region where only a single instance out of VF should be formed.
4735 // TODO: optimize such seldom cases if found important, see PR40816.
4736 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4737 if (isScalarWithPredication(I, VF)) {
4738 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4739 << *I << "\n");
4740 return;
4741 }
4742 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4743 Worklist.insert(I);
4744 };
4745
4746 // Start with the conditional branch. If the branch condition is an
4747 // instruction contained in the loop that is only used by the branch, it is
4748 // uniform.
4749 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4750 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4751 addToWorklistIfAllowed(Cmp);
4752
4753 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4754 // are pointers that are treated like consecutive pointers during
4755 // vectorization. The pointer operands of interleaved accesses are an
4756 // example.
4757 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4758
4759 // Holds pointer operands of instructions that are possibly non-uniform.
4760 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4761
4762 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4763 InstWidening WideningDecision = getWideningDecision(I, VF);
4764 assert(WideningDecision != CM_Unknown &&
4765 "Widening decision should be ready at this moment");
4766
4767 return (WideningDecision == CM_Widen ||
4768 WideningDecision == CM_Widen_Reverse ||
4769 WideningDecision == CM_Interleave);
4770 };
4771 // Iterate over the instructions in the loop, and collect all
4772 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4773 // that a consecutive-like pointer operand will be scalarized, we collect it
4774 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4775 // getelementptr instruction can be used by both vectorized and scalarized
4776 // memory instructions. For example, if a loop loads and stores from the same
4777 // location, but the store is conditional, the store will be scalarized, and
4778 // the getelementptr won't remain uniform.
4779 for (auto *BB : TheLoop->blocks())
4780 for (auto &I : *BB) {
4781 // If there's no pointer operand, there's nothing to do.
4782 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4783 if (!Ptr)
4784 continue;
4785
4786 // True if all users of Ptr are memory accesses that have Ptr as their
4787 // pointer operand.
4788 auto UsersAreMemAccesses =
4789 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4790 return getLoadStorePointerOperand(U) == Ptr;
4791 });
4792
4793 // Ensure the memory instruction will not be scalarized or used by
4794 // gather/scatter, making its pointer operand non-uniform. If the pointer
4795 // operand is used by any instruction other than a memory access, we
4796 // conservatively assume the pointer operand may be non-uniform.
4797 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4798 PossibleNonUniformPtrs.insert(Ptr);
4799
4800 // If the memory instruction will be vectorized and its pointer operand
4801 // is consecutive-like, or interleaving - the pointer operand should
4802 // remain uniform.
4803 else
4804 ConsecutiveLikePtrs.insert(Ptr);
4805 }
4806
4807 // Add to the Worklist all consecutive and consecutive-like pointers that
4808 // aren't also identified as possibly non-uniform.
4809 for (auto *V : ConsecutiveLikePtrs)
4810 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4811 addToWorklistIfAllowed(V);
4812
4813 // Expand Worklist in topological order: whenever a new instruction
4814 // is added, its users should already be inside Worklist. This ensures
4815 // a uniform instruction will only be used by uniform instructions.
4816 unsigned idx = 0;
4817 while (idx != Worklist.size()) {
4818 Instruction *I = Worklist[idx++];
4819
4820 for (auto OV : I->operand_values()) {
4821 // isOutOfScope operands cannot be uniform instructions.
4822 if (isOutOfScope(OV))
4823 continue;
4824 // First order recurrence Phi's should typically be considered
4825 // non-uniform.
4826 auto *OP = dyn_cast<PHINode>(OV);
4827 if (OP && Legal->isFirstOrderRecurrence(OP))
4828 continue;
4829 // If all the users of the operand are uniform, then add the
4830 // operand into the uniform worklist.
4831 auto *OI = cast<Instruction>(OV);
4832 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4833 auto *J = cast<Instruction>(U);
4834 return Worklist.count(J) ||
4835 (OI == getLoadStorePointerOperand(J) &&
4836 isUniformDecision(J, VF));
4837 }))
4838 addToWorklistIfAllowed(OI);
4839 }
4840 }
4841
4842 // Returns true if Ptr is the pointer operand of a memory access instruction
4843 // I, and I is known to not require scalarization.
4844 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4845 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4846 };
4847
4848 // For an instruction to be added into Worklist above, all its users inside
4849 // the loop should also be in Worklist. However, this condition cannot be
4850 // true for phi nodes that form a cyclic dependence. We must process phi
4851 // nodes separately. An induction variable will remain uniform if all users
4852 // of the induction variable and induction variable update remain uniform.
4853 // The code below handles both pointer and non-pointer induction variables.
4854 for (auto &Induction : Legal->getInductionVars()) {
4855 auto *Ind = Induction.first;
4856 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4857
4858 // Determine if all users of the induction variable are uniform after
4859 // vectorization.
4860 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4861 auto *I = cast<Instruction>(U);
4862 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4863 isVectorizedMemAccessUse(I, Ind);
4864 });
4865 if (!UniformInd)
4866 continue;
4867
4868 // Determine if all users of the induction variable update instruction are
4869 // uniform after vectorization.
4870 auto UniformIndUpdate =
4871 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4872 auto *I = cast<Instruction>(U);
4873 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4874 isVectorizedMemAccessUse(I, IndUpdate);
4875 });
4876 if (!UniformIndUpdate)
4877 continue;
4878
4879 // The induction variable and its update instruction will remain uniform.
4880 addToWorklistIfAllowed(Ind);
4881 addToWorklistIfAllowed(IndUpdate);
4882 }
4883
4884 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4885}
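// For illustration (a hypothetical loop doing only A[i] = B[i] + 1 with
// consecutive accesses): the latch compare, both getelementptrs, the
// induction phi and its update all end up in Uniforms[VF], because every
// in-loop user is either another uniform instruction or a memory access that
// keeps the value as its (uniform) address operand.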
4886
4887bool LoopVectorizationCostModel::runtimeChecksRequired() {
4888 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4889
4890 if (Legal->getRuntimePointerChecking()->Need) {
4891 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4892 "runtime pointer checks needed. Enable vectorization of this "
4893 "loop with '#pragma clang loop vectorize(enable)' when "
4894 "compiling with -Os/-Oz",
4895 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4896 return true;
4897 }
4898
4899 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4900 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4901 "runtime SCEV checks needed. Enable vectorization of this "
4902 "loop with '#pragma clang loop vectorize(enable)' when "
4903 "compiling with -Os/-Oz",
4904 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4905 return true;
4906 }
4907
4908 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4909 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4910 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4911 "runtime stride == 1 checks needed. Enable vectorization of "
4912 "this loop with '#pragma clang loop vectorize(enable)' when "
4913 "compiling with -Os/-Oz",
4914 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4915 return true;
4916 }
4917
4918 return false;
4919}
4920
4921Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4922 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4923 // TODO: It may be useful to do so, since it's still likely to be dynamically
4924 // uniform if the target can skip.
4925 reportVectorizationFailure(
4926 "Not inserting runtime ptr check for divergent target",
4927 "runtime pointer checks needed. Not enabled for divergent target",
4928 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4929 return None;
4930 }
4931
4932 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4933 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4934 if (TC == 1) {
4935 reportVectorizationFailure("Single iteration (non) loop",
4936 "loop trip count is one, irrelevant for vectorization",
4937 "SingleIterationLoop", ORE, TheLoop);
4938 return None;
4939 }
4940
4941 switch (ScalarEpilogueStatus) {
4942 case CM_ScalarEpilogueAllowed:
4943 return computeFeasibleMaxVF(TC);
4944 case CM_ScalarEpilogueNotNeededUsePredicate:
4945 LLVM_DEBUG(
4946 dbgs() << "LV: vector predicate hint/switch found.\n"
4947 << "LV: Not allowing scalar epilogue, creating predicated "
4948 << "vector loop.\n");
4949 break;
4950 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4951 // fallthrough as a special case of OptForSize
4952 case CM_ScalarEpilogueNotAllowedOptSize:
4953 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4954 LLVM_DEBUG(
4955 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4956 else
4957 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4958 << "count.\n");
4959
4960 // Bail if runtime checks are required, which are not good when optimising
4961 // for size.
4962 if (runtimeChecksRequired())
4963 return None;
4964 break;
4965 }
4966
4967 // Now try the tail folding
4968
4969 // Invalidate interleave groups that require an epilogue if we can't mask
4970 // the interleave-group.
4971 if (!useMaskedInterleavedAccesses(TTI))
4972 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4973
4974 unsigned MaxVF = computeFeasibleMaxVF(TC);
4975 if (TC > 0 && TC % MaxVF == 0) {
4976 // Accept MaxVF if we do not have a tail.
4977 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4978 return MaxVF;
4979 }
4980
4981 // If we don't know the precise trip count, or if the trip count that we
4982 // found modulo the vectorization factor is not zero, try to fold the tail
4983 // by masking.
4984 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4985 if (Legal->prepareToFoldTailByMasking()) {
4986 FoldTailByMasking = true;
4987 return MaxVF;
4988 }
4989
4990 if (TC == 0) {
4991 reportVectorizationFailure(
4992 "Unable to calculate the loop count due to complex control flow",
4993 "unable to calculate the loop count due to complex control flow",
4994 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4995 return None;
4996 }
4997
4998 reportVectorizationFailure(
4999 "Cannot optimize for size and vectorize at the same time.",
5000 "cannot optimize for size and vectorize at the same time. "
5001 "Enable vectorization of this loop with '#pragma clang loop "
5002 "vectorize(enable)' when compiling with -Os/-Oz",
5003 "NoTailLoopWithOptForSize", ORE, TheLoop);
5004 return None;
5005}
5006
5007unsigned
5008LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5009 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5010 unsigned SmallestType, WidestType;
5011 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5012 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5013
5014 // Get the maximum safe dependence distance in bits computed by LAA.
5015 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5016 // the memory accesses that is most restrictive (involved in the smallest
5017 // dependence distance).
5018 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5019
5020 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5021
5022 unsigned MaxVectorSize = WidestRegister / WidestType;
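// Illustrative only (assumed values, not taken from this report): with a
// 256-bit widest safe register and a widest element type of 64 bits,
// MaxVectorSize is 256 / 64 = 4 lanes.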
5023
5024 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5025 << " / " << WidestType << " bits.\n");
5026 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5027 << WidestRegister << " bits.\n");
5028
5029 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5030 " into one vector!");
5031 if (MaxVectorSize == 0) {
5032 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5033 MaxVectorSize = 1;
5034 return MaxVectorSize;
5035 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5036 isPowerOf2_32(ConstTripCount)) {
5037 // We need to clamp the VF to be the ConstTripCount. There is no point in
5038 // choosing a higher viable VF as done in the loop below.
5039 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5040 << ConstTripCount << "\n");
5041 MaxVectorSize = ConstTripCount;
5042 return MaxVectorSize;
5043 }
5044
5045 unsigned MaxVF = MaxVectorSize;
5046 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5047 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5048 // Collect all viable vectorization factors larger than the default MaxVF
5049 // (i.e. MaxVectorSize).
5050 SmallVector<unsigned, 8> VFs;
5051 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5052 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5053 VFs.push_back(VS);
5054
5055 // For each VF calculate its register usage.
5056 auto RUs = calculateRegisterUsage(VFs);
5057
5058 // Select the largest VF which doesn't require more registers than existing
5059 // ones.
5060 for (int i = RUs.size() - 1; i >= 0; --i) {
5061 bool Selected = true;
5062 for (auto& pair : RUs[i].MaxLocalUsers) {
5063 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5064 if (pair.second > TargetNumRegisters)
5065 Selected = false;
5066 }
5067 if (Selected) {
5068 MaxVF = VFs[i];
5069 break;
5070 }
5071 }
5072 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5073 if (MaxVF < MinVF) {
5074 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5075 << ") with target's minimum: " << MinVF << '\n');
5076 MaxVF = MinVF;
5077 }
5078 }
5079 }
5080 return MaxVF;
5081}
5082
5083VectorizationFactor
5084LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5085 float Cost = expectedCost(1).first;
5086 const float ScalarCost = Cost;
5087 unsigned Width = 1;
5088 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5089
5090 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5091 if (ForceVectorization && MaxVF > 1) {
5092 // Ignore scalar width, because the user explicitly wants vectorization.
5093 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5094 // evaluation.
5095 Cost = std::numeric_limits<float>::max();
5096 }
5097
5098 for (unsigned i = 2; i <= MaxVF; i *= 2) {
5099 // Notice that the vector loop needs to be executed fewer times, so
5100 // we need to divide the cost of the vector loop by the width of
5101 // the vector elements.
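// For illustration only (assumed numbers, not from this report): if the
// scalar loop costs 8 per iteration and the VF = 4 vector body costs 20,
// the normalized vector cost is 20 / 4 = 5 per scalar iteration, so the
// VF = 4 plan is cheaper than the scalar one.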
5102 VectorizationCostTy C = expectedCost(i);
5103 float VectorCost = C.first / (float)i;
5104 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5105 << " costs: " << (int)VectorCost << ".\n");
5106 if (!C.second && !ForceVectorization) {
5107 LLVM_DEBUG(
5108 dbgs() << "LV: Not considering vector loop of width " << i
5109 << " because it will not generate any vector instructions.\n");
5110 continue;
5111 }
5112 if (VectorCost < Cost) {
5113 Cost = VectorCost;
5114 Width = i;
5115 }
5116 }
5117
5118 if (!EnableCondStoresVectorization && NumPredStores) {
5119 reportVectorizationFailure("There are conditional stores.",
5120 "store that is conditionally executed prevents vectorization",
5121 "ConditionalStore", ORE, TheLoop);
5122 Width = 1;
5123 Cost = ScalarCost;
5124 }
5125
5126 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5127 << "LV: Vectorization seems to be not beneficial, "
5128 << "but was forced by a user.\n");
5129 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5130 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5131 return Factor;
5132}
5133
5134std::pair<unsigned, unsigned>
5135LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5136 unsigned MinWidth = -1U;
5137 unsigned MaxWidth = 8;
5138 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5139
5140 // For each block.
5141 for (BasicBlock *BB : TheLoop->blocks()) {
5142 // For each instruction in the loop.
5143 for (Instruction &I : BB->instructionsWithoutDebug()) {
5144 Type *T = I.getType();
5145
5146 // Skip ignored values.
5147 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5148 continue;
5149
5150 // Only examine Loads, Stores and PHINodes.
5151 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5152 continue;
5153
5154 // Examine PHI nodes that are reduction variables. Update the type to
5155 // account for the recurrence type.
5156 if (auto *PN = dyn_cast<PHINode>(&I)) {
5157 if (!Legal->isReductionVariable(PN))
5158 continue;
5159 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5160 T = RdxDesc.getRecurrenceType();
5161 }
5162
5163 // Examine the stored values.
5164 if (auto *ST = dyn_cast<StoreInst>(&I))
5165 T = ST->getValueOperand()->getType();
5166
5167 // Ignore loaded pointer types and stored pointer types that are not
5168 // vectorizable.
5169 //
5170 // FIXME: The check here attempts to predict whether a load or store will
5171 // be vectorized. We only know this for certain after a VF has
5172 // been selected. Here, we assume that if an access can be
5173 // vectorized, it will be. We should also look at extending this
5174 // optimization to non-pointer types.
5175 //
5176 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5177 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5178 continue;
5179
5180 MinWidth = std::min(MinWidth,
5181 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5182 MaxWidth = std::max(MaxWidth,
5183 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5184 }
5185 }
5186
5187 return {MinWidth, MaxWidth};
5188}
5189
5190unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5191 unsigned LoopCost) {
5192 // -- The interleave heuristics --
5193 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5194 // There are many micro-architectural considerations that we can't predict
5195 // at this level. For example, frontend pressure (on decode or fetch) due to
5196 // code size, or the number and capabilities of the execution ports.
5197 //
5198 // We use the following heuristics to select the interleave count:
5199 // 1. If the code has reductions, then we interleave to break the cross
5200 // iteration dependency.
5201 // 2. If the loop is really small, then we interleave to reduce the loop
5202 // overhead.
5203 // 3. We don't interleave if we think that we will spill registers to memory
5204 // due to the increased register pressure.
5205
5206 if (!isScalarEpilogueAllowed())
5207 return 1;
5208
5209 // If a maximum safe dependence distance already limits the width, do not interleave.
5210 if (Legal->getMaxSafeDepDistBytes() != -1U)
5211 return 1;
5212
5213 // Do not interleave loops with a relatively small known or estimated trip
5214 // count.
5215 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5216 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5217 return 1;
5218
5219 RegisterUsage R = calculateRegisterUsage({VF})[0];
5220 // We divide by these constants so assume that we have at least one
5221 // instruction that uses at least one register.
5222 for (auto& pair : R.MaxLocalUsers) {
5223 pair.second = std::max(pair.second, 1U);
5224 }
5225
5226 // We calculate the interleave count using the following formula.
5227 // Subtract the number of loop invariants from the number of available
5228 // registers. These registers are used by all of the interleaved instances.
5229 // Next, divide the remaining registers by the number of registers that is
5230 // required by the loop, in order to estimate how many parallel instances
5231 // fit without causing spills. All of this is rounded down if necessary to be
5232 // a power of two. We want power of two interleave count to simplify any
5233 // addressing operations or alignment considerations.
5234 // We also want power of two interleave counts to ensure that the induction
5235 // variable of the vector loop wraps to zero, when tail is folded by masking;
5236 // this currently happens when OptForSize, in which case IC is set to 1 above.
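// A worked example of this formula (hypothetical numbers, purely
// illustrative): with 16 registers in a class, 2 loop-invariant values and
// 3 registers of local usage per instance, (16 - 2) / 3 = 4, already a
// power of two, so up to 4 interleaved copies are assumed to fit without
// spilling.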
5237 unsigned IC = UINT_MAX;
5238
5239 for (auto& pair : R.MaxLocalUsers) {
5240 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5241 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5242 << " registers of "
5243 << TTI.getRegisterClassName(pair.first) << " register class\n");
5244 if (VF == 1) {
5245 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5246 TargetNumRegisters = ForceTargetNumScalarRegs;
5247 } else {
5248 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5249 TargetNumRegisters = ForceTargetNumVectorRegs;
5250 }
5251 unsigned MaxLocalUsers = pair.second;
5252 unsigned LoopInvariantRegs = 0;
5253 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5254 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5255
5256 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5257 // Don't count the induction variable as interleaved.
5258 if (EnableIndVarRegisterHeur) {
5259 TmpIC =
5260 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5261 std::max(1U, (MaxLocalUsers - 1)));
5262 }
5263
5264 IC = std::min(IC, TmpIC);
5265 }
5266
5267 // Clamp the interleave ranges to reasonable counts.
5268 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5269
5270 // Check if the user has overridden the max.
5271 if (VF == 1) {
5272 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5273 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5274 } else {
5275 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5276 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5277 }
5278
5279 // If trip count is known or estimated compile time constant, limit the
5280 // interleave count to be less than the trip count divided by VF.
5281 if (BestKnownTC) {
5282 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5283 }
5284
5285 // If we did not calculate the cost for VF (because the user selected the VF)
5286 // then we calculate the cost of VF here.
5287 if (LoopCost == 0)
5288 LoopCost = expectedCost(VF).first;
5289
5290 assert(LoopCost && "Non-zero loop cost expected");
5291
5292 // Clamp the calculated IC to be between 1 and the max interleave count
5293 // that the target and trip count allow.
5294 if (IC > MaxInterleaveCount)
5295 IC = MaxInterleaveCount;
5296 else if (IC < 1)
5297 IC = 1;
5298
5299 // Interleave if we vectorized this loop and there is a reduction that could
5300 // benefit from interleaving.
5301 if (VF > 1 && !Legal->getReductionVars().empty()) {
5302 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5303 return IC;
5304 }
5305
5306 // Note that if we've already vectorized the loop we will have done the
5307 // runtime check and so interleaving won't require further checks.
5308 bool InterleavingRequiresRuntimePointerCheck =
5309 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5310
5311 // We want to interleave small loops in order to reduce the loop overhead and
5312 // potentially expose ILP opportunities.
5313 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5314 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5315 // We assume that the cost overhead is 1 and we use the cost model
5316 // to estimate the cost of the loop and interleave until the cost of the
5317 // loop overhead is about 5% of the cost of the loop.
5318 unsigned SmallIC =
5319 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
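// Illustrative only (assumed values; the small-loop threshold is taken here
// to be its assumed default of 20): with a loop cost of 6,
// PowerOf2Floor(20 / 6) = 2, so at most two interleaved copies are used to
// amortize the loop overhead.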
5320
5321 // Interleave until store/load ports (estimated by max interleave count) are
5322 // saturated.
5323 unsigned NumStores = Legal->getNumStores();
5324 unsigned NumLoads = Legal->getNumLoads();
5325 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5326 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5327
5328 // If we have a scalar reduction (vector reductions are already dealt with
5329 // by this point), we can increase the critical path length if the loop
5330 // we're interleaving is inside another loop. Limit, by default to 2, so the
5331 // critical path only gets increased by one reduction operation.
5332 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5333 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5334 SmallIC = std::min(SmallIC, F);
5335 StoresIC = std::min(StoresIC, F);
5336 LoadsIC = std::min(LoadsIC, F);
5337 }
5338
5339 if (EnableLoadStoreRuntimeInterleave &&
5340 std::max(StoresIC, LoadsIC) > SmallIC) {
5341 LLVM_DEBUG(
5342 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5343 return std::max(StoresIC, LoadsIC);
5344 }
5345
5346 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5347 return SmallIC;
5348 }
5349
5350 // Interleave if this is a large loop (small loops are already dealt with by
5351 // this point) that could benefit from interleaving.
5352 bool HasReductions = !Legal->getReductionVars().empty();
5353 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5354 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5355 return IC;
5356 }
5357
5358 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5359 return 1;
5360}
5361
5362SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5363LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5364 // This function calculates the register usage by measuring the highest number
5365 // of values that are alive at a single location. Obviously, this is a very
5366 // rough estimation. We scan the loop in topological order and
5367 // assign a number to each instruction. We use RPO to ensure that defs are
5368 // met before their users. We assume that each instruction that has in-loop
5369 // users starts an interval. We record every time that an in-loop value is
5370 // used, so we have a list of the first and last occurrences of each
5371 // instruction. Next, we transpose this data structure into a multi map that
5372 // holds the list of intervals that *end* at a specific location. This multi
5373 // map allows us to perform a linear search. We scan the instructions linearly
5374 // and record each time that a new interval starts, by placing it in a set.
5375 // If we find this value in the multi-map then we remove it from the set.
5376 // The max register usage is the maximum size of the set.
5377 // We also search for instructions that are defined outside the loop, but are
5378 // used inside the loop. We need this number separately from the max-interval
5379 // usage number because when we unroll, loop-invariant values do not take
5380 // more registers.
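// A tiny worked example (hypothetical IR, purely illustrative):
//   %a = load ...        ; interval for %a opens here
//   %b = add %a, 1       ; last use of %a, interval for %b opens
//   %c = mul %b, %b      ; last use of %b, interval for %c opens
//   store %c, ...        ; last use of %c
// At most one value is live between any two of these instructions, so the
// estimated register usage for this chain is 1, plus whatever loop-invariant
// operands are counted separately below.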
5381 LoopBlocksDFS DFS(TheLoop);
5382 DFS.perform(LI);
5383
5384 RegisterUsage RU;
5385
5386 // Each 'key' in the map opens a new interval. The values
5387 // of the map are the index of the 'last seen' usage of the
5388 // instruction that is the key.
5389 using IntervalMap = DenseMap<Instruction *, unsigned>;
5390
5391 // Maps instruction to its index.
5392 SmallVector<Instruction *, 64> IdxToInstr;
5393 // Marks the end of each interval.
5394 IntervalMap EndPoint;
5395 // Saves the list of instruction indices that are used in the loop.
5396 SmallPtrSet<Instruction *, 8> Ends;
5397 // Saves the list of values that are used in the loop but are
5398 // defined outside the loop, such as arguments and constants.
5399 SmallPtrSet<Value *, 8> LoopInvariants;
5400
5401 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5402 for (Instruction &I : BB->instructionsWithoutDebug()) {
5403 IdxToInstr.push_back(&I);
5404
5405 // Save the end location of each USE.
5406 for (Value *U : I.operands()) {
5407 auto *Instr = dyn_cast<Instruction>(U);
5408
5409 // Ignore non-instruction values such as arguments, constants, etc.
5410 if (!Instr)
5411 continue;
5412
5413 // If this instruction is outside the loop then record it and continue.
5414 if (!TheLoop->contains(Instr)) {
5415 LoopInvariants.insert(Instr);
5416 continue;
5417 }
5418
5419 // Overwrite previous end points.
5420 EndPoint[Instr] = IdxToInstr.size();
5421 Ends.insert(Instr);
5422 }
5423 }
5424 }
5425
5426 // Saves the list of intervals that end with the index in 'key'.
5427 using InstrList = SmallVector<Instruction *, 2>;
5428 DenseMap<unsigned, InstrList> TransposeEnds;
5429
5430 // Transpose the EndPoints to a list of values that end at each index.
5431 for (auto &Interval : EndPoint)
5432 TransposeEnds[Interval.second].push_back(Interval.first);
5433
5434 SmallPtrSet<Instruction *, 8> OpenIntervals;
5435
5436 // Get the size of the widest register.
5437 unsigned MaxSafeDepDist = -1U;
5438 if (Legal->getMaxSafeDepDistBytes() != -1U)
5439 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5440 unsigned WidestRegister =
5441 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5442 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5443
5444 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5445 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5446
5447 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5448
5449 // A lambda that gets the register usage for the given type and VF.
5450 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5451 if (Ty->isTokenTy())
5452 return 0U;
5453 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5454 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5455 };
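// For illustration (assumed values, not from this report): an i32 value at
// VF = 8 on a target whose widest register is 128 bits needs
// max(1, 8 * 32 / 128) = 2 registers; a token-typed value needs none.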
5456
5457 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5458 Instruction *I = IdxToInstr[i];
5459
5460 // Remove all of the instructions that end at this location.
5461 InstrList &List = TransposeEnds[i];
5462 for (Instruction *ToRemove : List)
5463 OpenIntervals.erase(ToRemove);
5464
5465 // Ignore instructions that are never used within the loop.
5466 if (Ends.find(I) == Ends.end())
5467 continue;
5468
5469 // Skip ignored values.
5470 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5471 continue;
5472
5473 // For each VF find the maximum usage of registers.
5474 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5475 // Count the number of live intervals.
5476 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5477
5478 if (VFs[j] == 1) {
5479 for (auto Inst : OpenIntervals) {
5480 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5481 if (RegUsage.find(ClassID) == RegUsage.end())
5482 RegUsage[ClassID] = 1;
5483 else
5484 RegUsage[ClassID] += 1;
5485 }
5486 } else {
5487 collectUniformsAndScalars(VFs[j]);
5488 for (auto Inst : OpenIntervals) {
5489 // Skip ignored values for VF > 1.
5490 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5491 continue;
5492 if (isScalarAfterVectorization(Inst, VFs[j])) {
5493 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5494 if (RegUsage.find(ClassID) == RegUsage.end())
5495 RegUsage[ClassID] = 1;
5496 else
5497 RegUsage[ClassID] += 1;
5498 } else {
5499 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5500 if (RegUsage.find(ClassID) == RegUsage.end())
5501 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5502 else
5503 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5504 }
5505 }
5506 }
5507
5508 for (auto& pair : RegUsage) {
5509 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5510 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5511 else
5512 MaxUsages[j][pair.first] = pair.second;
5513 }
5514 }
5515
5516 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5517 << OpenIntervals.size() << '\n');
5518
5519 // Add the current instruction to the list of open intervals.
5520 OpenIntervals.insert(I);
5521 }
5522
5523 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5524 SmallMapVector<unsigned, unsigned, 4> Invariant;
5525
5526 for (auto Inst : LoopInvariants) {
5527 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5528 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5529 if (Invariant.find(ClassID) == Invariant.end())
5530 Invariant[ClassID] = Usage;
5531 else
5532 Invariant[ClassID] += Usage;
5533 }
5534
5535 LLVM_DEBUG({
5536 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5537 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5538 << " item\n";
5539 for (const auto &pair : MaxUsages[i]) {
5540 dbgs() << "LV(REG): RegisterClass: "
5541 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5542 << " registers\n";
5543 }
5544 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5545 << " item\n";
5546 for (const auto &pair : Invariant) {
5547 dbgs() << "LV(REG): RegisterClass: "
5548 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5549 << " registers\n";
5550 }
5551 });
5552
5553 RU.LoopInvariantRegs = Invariant;
5554 RU.MaxLocalUsers = MaxUsages[i];
5555 RUs[i] = RU;
5556 }
5557
5558 return RUs;
5559}
5560
5561bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5562 // TODO: Cost model for emulated masked load/store is completely
5563 // broken. This hack guides the cost model to use an artificially
5564 // high enough value to practically disable vectorization with such
5565 // operations, except where previously deployed legality hack allowed
5566 // using very low cost values. This is to avoid regressions coming simply
5567 // from moving "masked load/store" check from legality to cost model.
5568 // Masked Load/Gather emulation was previously never allowed.
5569 // Limited number of Masked Store/Scatter emulation was allowed.
5570 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5571 return isa<LoadInst>(I) ||
5572 (isa<StoreInst>(I) &&
5573 NumPredStores > NumberOfStoresToPredicate);
5574}
5575
5576void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5577 // If we aren't vectorizing the loop, or if we've already collected the
5578 // instructions to scalarize, there's nothing to do. Collection may already
5579 // have occurred if we have a user-selected VF and are now computing the
5580 // expected cost for interleaving.
5581 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5582 return;
5583
5584 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5585 // not profitable to scalarize any instructions, the presence of VF in the
5586 // map will indicate that we've analyzed it already.
5587 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5588
5589 // Find all the instructions that are scalar with predication in the loop and
5590 // determine if it would be better to not if-convert the blocks they are in.
5591 // If so, we also record the instructions to scalarize.
5592 for (BasicBlock *BB : TheLoop->blocks()) {
5593 if (!blockNeedsPredication(BB))
5594 continue;
5595 for (Instruction &I : *BB)
5596 if (isScalarWithPredication(&I)) {
5597 ScalarCostsTy ScalarCosts;
5598 // Do not apply discount logic if hacked cost is needed
5599 // for emulated masked memrefs.
5600 if (!useEmulatedMaskMemRefHack(&I) &&
5601 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5602 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5603 // Remember that BB will remain after vectorization.
5604 PredicatedBBsAfterVectorization.insert(BB);
5605 }
5606 }
5607}
5608
5609int LoopVectorizationCostModel::computePredInstDiscount(
5610 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5611 unsigned VF) {
5612 assert(!isUniformAfterVectorization(PredInst, VF) &&
5613 "Instruction marked uniform-after-vectorization will be predicated");
5614
5615 // Initialize the discount to zero, meaning that the scalar version and the
5616 // vector version cost the same.
5617 int Discount = 0;
5618
5619 // Holds instructions to analyze. The instructions we visit are mapped in
5620 // ScalarCosts. Those instructions are the ones that would be scalarized if
5621 // we find that the scalar version costs less.
5622 SmallVector<Instruction *, 8> Worklist;
5623
5624 // Returns true if the given instruction can be scalarized.
5625 auto canBeScalarized = [&](Instruction *I) -> bool {
5626 // We only attempt to scalarize instructions forming a single-use chain
5627 // from the original predicated block that would otherwise be vectorized.
5628 // Although not strictly necessary, we give up on instructions we know will
5629 // already be scalar to avoid traversing chains that are unlikely to be
5630 // beneficial.
5631 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5632 isScalarAfterVectorization(I, VF))
5633 return false;
5634
5635 // If the instruction is scalar with predication, it will be analyzed
5636 // separately. We ignore it within the context of PredInst.
5637 if (isScalarWithPredication(I))
5638 return false;
5639
5640 // If any of the instruction's operands are uniform after vectorization,
5641 // the instruction cannot be scalarized. This prevents, for example, a
5642 // masked load from being scalarized.
5643 //
5644 // We assume we will only emit a value for lane zero of an instruction
5645 // marked uniform after vectorization, rather than VF identical values.
5646 // Thus, if we scalarize an instruction that uses a uniform, we would
5647 // create uses of values corresponding to the lanes we aren't emitting code
5648 // for. This behavior can be changed by allowing getScalarValue to clone
5649 // the lane zero values for uniforms rather than asserting.
5650 for (Use &U : I->operands())
5651 if (auto *J = dyn_cast<Instruction>(U.get()))
5652 if (isUniformAfterVectorization(J, VF))
5653 return false;
5654
5655 // Otherwise, we can scalarize the instruction.
5656 return true;
5657 };
5658
5659 // Compute the expected cost discount from scalarizing the entire expression
5660 // feeding the predicated instruction. We currently only consider expressions
5661 // that are single-use instruction chains.
5662 Worklist.push_back(PredInst);
5663 while (!Worklist.empty()) {
5664 Instruction *I = Worklist.pop_back_val();
5665
5666 // If we've already analyzed the instruction, there's nothing to do.
5667 if (ScalarCosts.find(I) != ScalarCosts.end())
5668 continue;
5669
5670 // Compute the cost of the vector instruction. Note that this cost already
5671 // includes the scalarization overhead of the predicated instruction.
5672 unsigned VectorCost = getInstructionCost(I, VF).first;
5673
5674 // Compute the cost of the scalarized instruction. This cost is the cost of
5675 // the instruction as if it wasn't if-converted and instead remained in the
5676 // predicated block. We will scale this cost by block probability after
5677 // computing the scalarization overhead.
5678 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5679
5680 // Compute the scalarization overhead of needed insertelement instructions
5681 // and phi nodes.
5682 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5683 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5684 true, false);
5685 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5686 }
5687
5688 // Compute the scalarization overhead of needed extractelement
5689 // instructions. For each of the instruction's operands, if the operand can
5690 // be scalarized, add it to the worklist; otherwise, account for the
5691 // overhead.
5692 for (Use &U : I->operands())
5693 if (auto *J = dyn_cast<Instruction>(U.get())) {
5694 assert(VectorType::isValidElementType(J->getType()) &&
5695 "Instruction has non-scalar type");
5696 if (canBeScalarized(J))
5697 Worklist.push_back(J);
5698 else if (needsExtract(J, VF))
5699 ScalarCost += TTI.getScalarizationOverhead(
5700 ToVectorTy(J->getType(),VF), false, true);
5701 }
5702
5703 // Scale the total scalar cost by block probability.
5704 ScalarCost /= getReciprocalPredBlockProb();
5705
5706 // Compute the discount. A non-negative discount means the vector version
5707 // of the instruction costs more, and scalarizing would be beneficial.
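// For example (assumed costs, illustrative only): if the vector form of a
// chain member costs 10 and its probability-scaled scalar form costs 6, the
// running discount grows by 10 - 6 = 4, which favours scalarizing the chain.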
5708 Discount += VectorCost - ScalarCost;
5709 ScalarCosts[I] = ScalarCost;
5710 }
5711
5712 return Discount;
5713}
5714
5715LoopVectorizationCostModel::VectorizationCostTy
5716LoopVectorizationCostModel::expectedCost(unsigned VF) {
5717 VectorizationCostTy Cost;
5718
5719 // For each block.
5720 for (BasicBlock *BB : TheLoop->blocks()) {
5721 VectorizationCostTy BlockCost;
5722
5723 // For each instruction in the old loop.
5724 for (Instruction &I : BB->instructionsWithoutDebug()) {
5725 // Skip ignored values.
5726 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5727 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5728 continue;
5729
5730 VectorizationCostTy C = getInstructionCost(&I, VF);
5731
5732 // Check if we should override the cost.
5733 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5734 C.first = ForceTargetInstructionCost;
5735
5736 BlockCost.first += C.first;
5737 BlockCost.second |= C.second;
5738 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5739 << " for VF " << VF << " For instruction: " << I
5740 << '\n');
5741 }
5742
5743 // If we are vectorizing a predicated block, it will have been
5744 // if-converted. This means that the block's instructions (aside from
5745 // stores and instructions that may divide by zero) will now be
5746 // unconditionally executed. For the scalar case, we may not always execute
5747 // the predicated block. Thus, scale the block's cost by the probability of
5748 // executing it.
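// Illustrative only: getReciprocalPredBlockProb() is assumed here to model a
// 50% chance of executing the block (a reciprocal of 2), so a predicated
// block with scalar cost 12 would contribute 12 / 2 = 6 to the scalar cost.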
5749 if (VF == 1 && blockNeedsPredication(BB))
5750 BlockCost.first /= getReciprocalPredBlockProb();
5751
5752 Cost.first += BlockCost.first;
5753 Cost.second |= BlockCost.second;
5754 }
5755
5756 return Cost;
5757}
5758
5759/// Gets Address Access SCEV after verifying that the access pattern
5760/// is loop invariant except the induction variable dependence.
5761///
5762/// This SCEV can be sent to the Target in order to estimate the address
5763/// calculation cost.
5764static const SCEV *getAddressAccessSCEV(
5765 Value *Ptr,
5766 LoopVectorizationLegality *Legal,
5767 PredicatedScalarEvolution &PSE,
5768 const Loop *TheLoop) {
5769
5770 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5771 if (!Gep)
5772 return nullptr;
5773
5774 // We are looking for a gep with all loop invariant indices except for one
5775 // which should be an induction variable.
5776 auto SE = PSE.getSE();
5777 unsigned NumOperands = Gep->getNumOperands();
5778 for (unsigned i = 1; i < NumOperands; ++i) {
5779 Value *Opd = Gep->getOperand(i);
5780 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5781 !Legal->isInductionVariable(Opd))
5782 return nullptr;
5783 }
5784
5785 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5786 return PSE.getSCEV(Ptr);
5787}
5788
5789static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5790 return Legal->hasStride(I->getOperand(0)) ||
5791 Legal->hasStride(I->getOperand(1));
5792}
5793
5794unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5795 unsigned VF) {
5796 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5797 Type *ValTy = getMemInstValueType(I);
5798 auto SE = PSE.getSE();
5799
5800 unsigned AS = getLoadStoreAddressSpace(I);
5801 Value *Ptr = getLoadStorePointerOperand(I);
5802 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5803
5804 // Figure out whether the access is strided and get the stride value
5805 // if it's known at compile time.
5806 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5807
5808 // Get the cost of the scalar memory instruction and address computation.
5809 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5810
5811 // Don't pass *I here, since it is scalar but will actually be part of a
5812 // vectorized loop where the user of it is a vectorized instruction.
5813 const MaybeAlign Alignment = getLoadStoreAlignment(I);
5814 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5815 Alignment, AS);
5816
5817 // Get the overhead of the extractelement and insertelement instructions
5818 // we might create due to scalarization.
5819 Cost += getScalarizationOverhead(I, VF);
5820
5821 // If we have a predicated store, it may not be executed for each vector
5822 // lane. Scale the cost by the probability of executing the predicated
5823 // block.
5824 if (isPredicatedInst(I)) {
5825 Cost /= getReciprocalPredBlockProb();
5826
5827 if (useEmulatedMaskMemRefHack(I))
5828 // Artificially setting to a high enough value to practically disable
5829 // vectorization with such operations.
5830 Cost = 3000000;
5831 }
5832
5833 return Cost;
5834}
5835
5836unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5837 unsigned VF) {
5838 Type *ValTy = getMemInstValueType(I);
5839 Type *VectorTy = ToVectorTy(ValTy, VF);
5840 Value *Ptr = getLoadStorePointerOperand(I);
5841 unsigned AS = getLoadStoreAddressSpace(I);
5842 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5843
5844 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5845 "Stride should be 1 or -1 for consecutive memory access");
5846 const MaybeAlign Alignment = getLoadStoreAlignment(I);
5847 unsigned Cost = 0;
5848 if (Legal->isMaskRequired(I))
5849 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5850 Alignment ? Alignment->value() : 0, AS);
5851 else
5852 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5853
5854 bool Reverse = ConsecutiveStride < 0;
5855 if (Reverse)
5856 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5857 return Cost;
5858}
5859
5860unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5861 unsigned VF) {
5862 Type *ValTy = getMemInstValueType(I);
5863 Type *VectorTy = ToVectorTy(ValTy, VF);
5864 const MaybeAlign Alignment = getLoadStoreAlignment(I);
5865 unsigned AS = getLoadStoreAddressSpace(I);
5866 if (isa<LoadInst>(I)) {
5867 return TTI.getAddressComputationCost(ValTy) +
5868 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5869 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5870 }
5871 StoreInst *SI = cast<StoreInst>(I);
5872
5873 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5874 return TTI.getAddressComputationCost(ValTy) +
5875 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5876 (isLoopInvariantStoreValue
5877 ? 0
5878 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5879 VF - 1));
5880}
5881
5882unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5883 unsigned VF) {
5884 Type *ValTy = getMemInstValueType(I);
5885 Type *VectorTy = ToVectorTy(ValTy, VF);
5886 const MaybeAlign Alignment = getLoadStoreAlignment(I);
5887 Value *Ptr = getLoadStorePointerOperand(I);
5888
5889 return TTI.getAddressComputationCost(VectorTy) +
5890 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5891 Legal->isMaskRequired(I),
5892 Alignment ? Alignment->value() : 0);
5893}
5894
5895unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5896 unsigned VF) {
5897 Type *ValTy = getMemInstValueType(I);
5898 Type *VectorTy = ToVectorTy(ValTy, VF);
5899 unsigned AS = getLoadStoreAddressSpace(I);
5900
5901 auto Group = getInterleavedAccessGroup(I);
5902 assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5902, __PRETTY_FUNCTION__))
;
5903
5904 unsigned InterleaveFactor = Group->getFactor();
5905 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5906
5907 // Holds the indices of existing members in an interleaved load group.
5908 // An interleaved store group doesn't need this as it doesn't allow gaps.
5909 SmallVector<unsigned, 4> Indices;
5910 if (isa<LoadInst>(I)) {
5911 for (unsigned i = 0; i < InterleaveFactor; i++)
5912 if (Group->getMember(i))
5913 Indices.push_back(i);
5914 }
5915
5916 // Calculate the cost of the whole interleaved group.
5917 bool UseMaskForGaps =
5918 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5919 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5920 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5921 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5922
5923 if (Group->isReverse()) {
5924 // TODO: Add support for reversed masked interleaved access.
5925 assert(!Legal->isMaskRequired(I) &&((!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."
) ? static_cast<void> (0) : __assert_fail ("!Legal->isMaskRequired(I) && \"Reverse masked interleaved access not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5926, __PRETTY_FUNCTION__))
5926 "Reverse masked interleaved access not supported.")((!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."
) ? static_cast<void> (0) : __assert_fail ("!Legal->isMaskRequired(I) && \"Reverse masked interleaved access not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5926, __PRETTY_FUNCTION__))
;
5927 Cost += Group->getNumMembers() *
5928 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5929 }
5930 return Cost;
5931}
5932
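The Indices vector above records only the members that are actually present in an interleaved load group, so gaps simply drop out of the cost query. A small standalone illustration, assuming a factor-3 load group with a gap at position 1 (the array and names are illustrative, not the InterleaveGroup API):

#include <cstdio>
#include <vector>

int main() {
  const unsigned InterleaveFactor = 3;
  // Member i is present if the group has an instruction at that position;
  // here we assume loads of A[3*i] and A[3*i+2] with a gap at offset +1.
  const bool MemberPresent[InterleaveFactor] = {true, false, true};

  std::vector<unsigned> Indices;
  for (unsigned i = 0; i < InterleaveFactor; ++i)
    if (MemberPresent[i])
      Indices.push_back(i);

  for (unsigned Idx : Indices)
    std::printf("%u ", Idx); // prints: 0 2
  std::printf("\n");
  return 0;
}
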
5933unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5934 unsigned VF) {
5935 // Calculate scalar cost only. Vectorization cost should be ready at this
5936 // moment.
5937 if (VF == 1) {
5938 Type *ValTy = getMemInstValueType(I);
5939 const MaybeAlign Alignment = getLoadStoreAlignment(I);
5940 unsigned AS = getLoadStoreAddressSpace(I);
5941
5942 return TTI.getAddressComputationCost(ValTy) +
5943 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5944 }
5945 return getWideningCost(I, VF);
5946}
5947
5948LoopVectorizationCostModel::VectorizationCostTy
5949LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5950 // If we know that this instruction will remain uniform, check the cost of
5951 // the scalar version.
5952 if (isUniformAfterVectorization(I, VF))
5953 VF = 1;
5954
5955 if (VF > 1 && isProfitableToScalarize(I, VF))
5956 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5957
5958 // Forced scalars do not have any scalarization overhead.
5959 auto ForcedScalar = ForcedScalars.find(VF);
5960 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5961 auto InstSet = ForcedScalar->second;
5962 if (InstSet.find(I) != InstSet.end())
5963 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5964 }
5965
5966 Type *VectorTy;
5967 unsigned C = getInstructionCost(I, VF, VectorTy);
5968
5969 bool TypeNotScalarized =
5970 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5971 return VectorizationCostTy(C, TypeNotScalarized);
5972}
5973
5974unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5975 unsigned VF) {
5976
5977 if (VF == 1)
5978 return 0;
5979
5980 unsigned Cost = 0;
5981 Type *RetTy = ToVectorTy(I->getType(), VF);
5982 if (!RetTy->isVoidTy() &&
5983 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5984 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5985
5986 // Some targets keep addresses scalar.
5987 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5988 return Cost;
5989
5990 // Some targets support efficient element stores.
5991 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5992 return Cost;
5993
5994 // Collect operands to consider.
5995 CallInst *CI = dyn_cast<CallInst>(I);
5996 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5997
5998 // Skip operands that do not require extraction/scalarization and do not incur
5999 // any overhead.
6000 return Cost + TTI.getOperandsScalarizationOverhead(
6001 filterExtractingOperands(Ops, VF), VF);
6002}
6003
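A simplified standalone model of the overhead computed above: inserts rebuild the vector result of the scalarized copies and extracts feed their operands. The per-element cost of 1 is an assumption (in the real code TTI supplies it), and the real code additionally filters operands and honors target hooks, which this sketch ignores:

#include <cstdio>

static unsigned modelScalarizationOverhead(unsigned VF, bool HasVectorResult,
                                           unsigned NumVectorOperands) {
  unsigned Cost = 0;
  if (HasVectorResult)
    Cost += VF;                   // insertelements to rebuild the vector result
  Cost += NumVectorOperands * VF; // extractelements to feed the scalar copies
  return Cost;
}

int main() {
  std::printf("%u\n", modelScalarizationOverhead(4, true, 2)); // 4 + 2*4 = 12
  return 0;
}
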
6004void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6005 if (VF == 1)
6006 return;
6007 NumPredStores = 0;
6008 for (BasicBlock *BB : TheLoop->blocks()) {
6009 // For each instruction in the old loop.
6010 for (Instruction &I : *BB) {
6011 Value *Ptr = getLoadStorePointerOperand(&I);
6012 if (!Ptr)
6013 continue;
6014
6015 // TODO: We should generate better code and update the cost model for
6016 // predicated uniform stores. Today they are treated as any other
6017 // predicated store (see added test cases in
6018 // invariant-store-vectorization.ll).
6019 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6020 NumPredStores++;
6021
6022 if (Legal->isUniform(Ptr) &&
6023 // Conditional loads and stores should be scalarized and predicated.
6024 // isScalarWithPredication cannot be used here since masked
6025 // gather/scatters are not considered scalar with predication.
6026 !Legal->blockNeedsPredication(I.getParent())) {
6027 // TODO: Avoid replicating loads and stores instead of
6028 // relying on instcombine to remove them.
6029 // Load: Scalar load + broadcast
6030 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6031 unsigned Cost = getUniformMemOpCost(&I, VF);
6032 setWideningDecision(&I, VF, CM_Scalarize, Cost);
6033 continue;
6034 }
6035
6036 // We assume that widening is the best solution when possible.
6037 if (memoryInstructionCanBeWidened(&I, VF)) {
6038 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6039 int ConsecutiveStride =
6040 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6041 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.") ? static_cast<void> (0
) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6042, __PRETTY_FUNCTION__))
6042 "Expected consecutive stride.")(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.") ? static_cast<void> (0
) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6042, __PRETTY_FUNCTION__))
;
6043 InstWidening Decision =
6044 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6045 setWideningDecision(&I, VF, Decision, Cost);
6046 continue;
6047 }
6048
6049 // Choose between Interleaving, Gather/Scatter or Scalarization.
6050 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6051 unsigned NumAccesses = 1;
6052 if (isAccessInterleaved(&I)) {
6053 auto Group = getInterleavedAccessGroup(&I);
6054 assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6054, __PRETTY_FUNCTION__))
;
6055
6056 // Make one decision for the whole group.
6057 if (getWideningDecision(&I, VF) != CM_Unknown)
6058 continue;
6059
6060 NumAccesses = Group->getNumMembers();
6061 if (interleavedAccessCanBeWidened(&I, VF))
6062 InterleaveCost = getInterleaveGroupCost(&I, VF);
6063 }
6064
6065 unsigned GatherScatterCost =
6066 isLegalGatherOrScatter(&I)
6067 ? getGatherScatterCost(&I, VF) * NumAccesses
6068 : std::numeric_limits<unsigned>::max();
6069
6070 unsigned ScalarizationCost =
6071 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6072
6073 // Choose better solution for the current VF,
6074 // write down this decision and use it during vectorization.
6075 unsigned Cost;
6076 InstWidening Decision;
6077 if (InterleaveCost <= GatherScatterCost &&
6078 InterleaveCost < ScalarizationCost) {
6079 Decision = CM_Interleave;
6080 Cost = InterleaveCost;
6081 } else if (GatherScatterCost < ScalarizationCost) {
6082 Decision = CM_GatherScatter;
6083 Cost = GatherScatterCost;
6084 } else {
6085 Decision = CM_Scalarize;
6086 Cost = ScalarizationCost;
6087 }
6088 // If the instruction belongs to an interleave group, the whole group
6089 // receives the same decision. The whole group receives the cost, but
6090 // the cost will actually be assigned to one instruction.
6091 if (auto Group = getInterleavedAccessGroup(&I))
6092 setWideningDecision(Group, VF, Decision, Cost);
6093 else
6094 setWideningDecision(&I, VF, Decision, Cost);
6095 }
6096 }
6097
6098 // Make sure that any load of address and any other address computation
6099 // remains scalar unless there is gather/scatter support. This avoids
6100 // inevitable extracts into address registers, and also has the benefit of
6101 // activating LSR more, since that pass can't optimize vectorized
6102 // addresses.
6103 if (TTI.prefersVectorizedAddressing())
6104 return;
6105
6106 // Start with all scalar pointer uses.
6107 SmallPtrSet<Instruction *, 8> AddrDefs;
6108 for (BasicBlock *BB : TheLoop->blocks())
6109 for (Instruction &I : *BB) {
6110 Instruction *PtrDef =
6111 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6112 if (PtrDef && TheLoop->contains(PtrDef) &&
6113 getWideningDecision(&I, VF) != CM_GatherScatter)
6114 AddrDefs.insert(PtrDef);
6115 }
6116
6117 // Add all instructions used to generate the addresses.
6118 SmallVector<Instruction *, 4> Worklist;
6119 for (auto *I : AddrDefs)
6120 Worklist.push_back(I);
6121 while (!Worklist.empty()) {
6122 Instruction *I = Worklist.pop_back_val();
6123 for (auto &Op : I->operands())
6124 if (auto *InstOp = dyn_cast<Instruction>(Op))
6125 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6126 AddrDefs.insert(InstOp).second)
6127 Worklist.push_back(InstOp);
6128 }
6129
6130 for (auto *I : AddrDefs) {
6131 if (isa<LoadInst>(I)) {
6132 // Setting the desired widening decision should ideally be handled
6133 // by cost functions, but since this involves the task of finding out
6134 // if the loaded register is involved in an address computation, it is
6135 // instead changed here when we know this is the case.
6136 InstWidening Decision = getWideningDecision(I, VF);
6137 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6138 // Scalarize a widened load of address.
6139 setWideningDecision(I, VF, CM_Scalarize,
6140 (VF * getMemoryInstructionCost(I, 1)));
6141 else if (auto Group = getInterleavedAccessGroup(I)) {
6142 // Scalarize an interleave group of address loads.
6143 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6144 if (Instruction *Member = Group->getMember(I))
6145 setWideningDecision(Member, VF, CM_Scalarize,
6146 (VF * getMemoryInstructionCost(Member, 1)));
6147 }
6148 }
6149 } else
6150 // Make sure I gets scalarized and a cost estimate without
6151 // scalarization overhead.
6152 ForcedScalars[VF].insert(I);
6153 }
6154}
6155
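The decision logic above breaks ties in a specific direction: interleaving wins a tie against gather/scatter, and scalarization is chosen whenever gather/scatter is not strictly cheaper. A standalone sketch of just that selection (the enum and function names are illustrative):

#include <cstdio>
#include <limits>

enum Decision { Interleave, GatherScatter, Scalarize };

static Decision pickWidening(unsigned InterleaveCost, unsigned GatherScatterCost,
                             unsigned ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return GatherScatter;
  return Scalarize;
}

int main() {
  unsigned Inf = std::numeric_limits<unsigned>::max();
  std::printf("%d\n", pickWidening(8, 8, 10));  // 0: Interleave wins the tie
  std::printf("%d\n", pickWidening(Inf, 6, 6)); // 2: Scalarize, gather/scatter not cheaper
  return 0;
}
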
6156unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6157 unsigned VF,
6158 Type *&VectorTy) {
6159 Type *RetTy = I->getType();
6160 if (canTruncateToMinimalBitwidth(I, VF))
6161 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6162 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6163 auto SE = PSE.getSE();
6164
6165 // TODO: We need to estimate the cost of intrinsic calls.
6166 switch (I->getOpcode()) {
6167 case Instruction::GetElementPtr:
6168 // We mark this instruction as zero-cost because the cost of GEPs in
6169 // vectorized code depends on whether the corresponding memory instruction
6170 // is scalarized or not. Therefore, we handle GEPs with the memory
6171 // instruction cost.
6172 return 0;
6173 case Instruction::Br: {
6174 // In cases of scalarized and predicated instructions, there will be VF
6175 // predicated blocks in the vectorized loop. Each branch around these
6176 // blocks requires also an extract of its vector compare i1 element.
6177 bool ScalarPredicatedBB = false;
6178 BranchInst *BI = cast<BranchInst>(I);
6179 if (VF > 1 && BI->isConditional() &&
6180 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6181 PredicatedBBsAfterVectorization.end() ||
6182 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6183 PredicatedBBsAfterVectorization.end()))
6184 ScalarPredicatedBB = true;
6185
6186 if (ScalarPredicatedBB) {
6187 // Return cost for branches around scalarized and predicated blocks.
6188 Type *Vec_i1Ty =
6189 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6190 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6191 (TTI.getCFInstrCost(Instruction::Br) * VF));
6192 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6193 // The back-edge branch will remain, as will all scalar branches.
6194 return TTI.getCFInstrCost(Instruction::Br);
6195 else
6196 // This branch will be eliminated by if-conversion.
6197 return 0;
6198 // Note: We currently assume zero cost for an unconditional branch inside
6199 // a predicated block since it will become a fall-through, although we
6200 // may decide in the future to call TTI for all branches.
6201 }
6202 case Instruction::PHI: {
6203 auto *Phi = cast<PHINode>(I);
6204
6205 // First-order recurrences are replaced by vector shuffles inside the loop.
6206 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6207 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6208 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6209 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6210
6211 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6212 // converted into select instructions. We require N - 1 selects per phi
6213 // node, where N is the number of incoming values.
6214 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6215 return (Phi->getNumIncomingValues() - 1) *
6216 TTI.getCmpSelInstrCost(
6217 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6218 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6219
6220 return TTI.getCFInstrCost(Instruction::PHI);
6221 }
6222 case Instruction::UDiv:
6223 case Instruction::SDiv:
6224 case Instruction::URem:
6225 case Instruction::SRem:
6226 // If we have a predicated instruction, it may not be executed for each
6227 // vector lane. Get the scalarization cost and scale this amount by the
6228 // probability of executing the predicated block. If the instruction is not
6229 // predicated, we fall through to the next case.
6230 if (VF > 1 && isScalarWithPredication(I)) {
6231 unsigned Cost = 0;
6232
6233 // These instructions have a non-void type, so account for the phi nodes
6234 // that we will create. This cost is likely to be zero. The phi node
6235 // cost, if any, should be scaled by the block probability because it
6236 // models a copy at the end of each predicated block.
6237 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6238
6239 // The cost of the non-predicated instruction.
6240 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6241
6242 // The cost of insertelement and extractelement instructions needed for
6243 // scalarization.
6244 Cost += getScalarizationOverhead(I, VF);
6245
6246 // Scale the cost by the probability of executing the predicated blocks.
6247 // This assumes the predicated block for each vector lane is equally
6248 // likely.
6249 return Cost / getReciprocalPredBlockProb();
6250 }
6251 LLVM_FALLTHROUGH[[gnu::fallthrough]];
6252 case Instruction::Add:
6253 case Instruction::FAdd:
6254 case Instruction::Sub:
6255 case Instruction::FSub:
6256 case Instruction::Mul:
6257 case Instruction::FMul:
6258 case Instruction::FDiv:
6259 case Instruction::FRem:
6260 case Instruction::Shl:
6261 case Instruction::LShr:
6262 case Instruction::AShr:
6263 case Instruction::And:
6264 case Instruction::Or:
6265 case Instruction::Xor: {
6266 // Since we will replace the stride by 1 the multiplication should go away.
6267 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6268 return 0;
6269 // Certain instructions can be cheaper to vectorize if they have a constant
6270 // second vector operand. One example of this are shifts on x86.
6271 Value *Op2 = I->getOperand(1);
6272 TargetTransformInfo::OperandValueProperties Op2VP;
6273 TargetTransformInfo::OperandValueKind Op2VK =
6274 TTI.getOperandInfo(Op2, Op2VP);
6275 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6276 Op2VK = TargetTransformInfo::OK_UniformValue;
6277
6278 SmallVector<const Value *, 4> Operands(I->operand_values());
6279 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6280 return N * TTI.getArithmeticInstrCost(
6281 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6282 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6283 }
6284 case Instruction::FNeg: {
6285 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6286 return N * TTI.getArithmeticInstrCost(
6287 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6288 TargetTransformInfo::OK_AnyValue,
6289 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6290 I->getOperand(0), I);
6291 }
6292 case Instruction::Select: {
6293 SelectInst *SI = cast<SelectInst>(I);
6294 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6295 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6296 Type *CondTy = SI->getCondition()->getType();
6297 if (!ScalarCond)
6298 CondTy = VectorType::get(CondTy, VF);
6299
6300 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6301 }
6302 case Instruction::ICmp:
6303 case Instruction::FCmp: {
6304 Type *ValTy = I->getOperand(0)->getType();
6305 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6306 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6307 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6308 VectorTy = ToVectorTy(ValTy, VF);
6309 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6310 }
6311 case Instruction::Store:
6312 case Instruction::Load: {
6313 unsigned Width = VF;
6314 if (Width > 1) {
6315 InstWidening Decision = getWideningDecision(I, Width);
6316 assert(Decision != CM_Unknown &&((Decision != CM_Unknown && "CM decision should be taken at this point"
) ? static_cast<void> (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6317, __PRETTY_FUNCTION__))
6317 "CM decision should be taken at this point")((Decision != CM_Unknown && "CM decision should be taken at this point"
) ? static_cast<void> (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6317, __PRETTY_FUNCTION__))
;
6318 if (Decision == CM_Scalarize)
6319 Width = 1;
6320 }
6321 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6322 return getMemoryInstructionCost(I, VF);
6323 }
6324 case Instruction::ZExt:
6325 case Instruction::SExt:
6326 case Instruction::FPToUI:
6327 case Instruction::FPToSI:
6328 case Instruction::FPExt:
6329 case Instruction::PtrToInt:
6330 case Instruction::IntToPtr:
6331 case Instruction::SIToFP:
6332 case Instruction::UIToFP:
6333 case Instruction::Trunc:
6334 case Instruction::FPTrunc:
6335 case Instruction::BitCast: {
6336 // We optimize the truncation of induction variables having constant
6337 // integer steps. The cost of these truncations is the same as the scalar
6338 // operation.
6339 if (isOptimizableIVTruncate(I, VF)) {
6340 auto *Trunc = cast<TruncInst>(I);
6341 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6342 Trunc->getSrcTy(), Trunc);
6343 }
6344
6345 Type *SrcScalarTy = I->getOperand(0)->getType();
6346 Type *SrcVecTy =
6347 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6348 if (canTruncateToMinimalBitwidth(I, VF)) {
6349 // This cast is going to be shrunk. This may remove the cast or it might
6350 // turn it into a slightly different cast. For example, if MinBW == 16,
6351 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6352 //
6353 // Calculate the modified src and dest types.
6354 Type *MinVecTy = VectorTy;
6355 if (I->getOpcode() == Instruction::Trunc) {
6356 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6357 VectorTy =
6358 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6359 } else if (I->getOpcode() == Instruction::ZExt ||
6360 I->getOpcode() == Instruction::SExt) {
6361 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6362 VectorTy =
6363 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6364 }
6365 }
6366
6367 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6368 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6369 }
6370 case Instruction::Call: {
6371 bool NeedToScalarize;
6372 CallInst *CI = cast<CallInst>(I);
6373 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6374 if (getVectorIntrinsicIDForCall(CI, TLI))
6375 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6376 return CallCost;
6377 }
6378 default:
6379 // The cost of executing VF copies of the scalar instruction. This opcode
6380 // is unknown. Assume that it is the same as 'mul'.
6381 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6382 getScalarizationOverhead(I, VF);
6383 } // end of switch.
6384}
6385
6386char LoopVectorize::ID = 0;
6387
6388static const char lv_name[] = "Loop Vectorization";
6389
6390INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)static void *initializeLoopVectorizePassOnce(PassRegistry &
Registry) {
6391INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
6392INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)initializeBasicAAWrapperPassPass(Registry);
6393INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry);
6394INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)initializeGlobalsAAWrapperPassPass(Registry);
6395INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry);
6396INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)initializeBlockFrequencyInfoWrapperPassPass(Registry);
6397INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry);
6398INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)initializeScalarEvolutionWrapperPassPass(Registry);
6399INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)initializeLoopInfoWrapperPassPass(Registry);
6400INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)initializeLoopAccessLegacyAnalysisPass(Registry);
6401INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)initializeDemandedBitsWrapperPassPass(Registry);
6402INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
6403INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)initializeProfileSummaryInfoWrapperPassPass(Registry);
6404INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)initializeInjectTLIMappingsLegacyPass(Registry);
6405INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo( lv_name, "loop-vectorize", &
LoopVectorize::ID, PassInfo::NormalCtor_t(callDefaultCtor<
LoopVectorize>), false, false); Registry.registerPass(*PI,
true); return PI; } static llvm::once_flag InitializeLoopVectorizePassFlag
; void llvm::initializeLoopVectorizePass(PassRegistry &Registry
) { llvm::call_once(InitializeLoopVectorizePassFlag, initializeLoopVectorizePassOnce
, std::ref(Registry)); }
6406
6407namespace llvm {
6408
6409Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6410
6411Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6412 bool VectorizeOnlyWhenForced) {
6413 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6414}
6415
6416} // end namespace llvm
6417
6418bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6419 // Check if the pointer operand of a load or store instruction is
6420 // consecutive.
6421 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6422 return Legal->isConsecutivePtr(Ptr);
6423 return false;
6424}
6425
6426void LoopVectorizationCostModel::collectValuesToIgnore() {
6427 // Ignore ephemeral values.
6428 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6429
6430 // Ignore type-promoting instructions we identified during reduction
6431 // detection.
6432 for (auto &Reduction : Legal->getReductionVars()) {
6433 RecurrenceDescriptor &RedDes = Reduction.second;
6434 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6435 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6436 }
6437 // Ignore type-casting instructions we identified during induction
6438 // detection.
6439 for (auto &Induction : Legal->getInductionVars()) {
6440 InductionDescriptor &IndDes = Induction.second;
6441 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6442 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6443 }
6444}
6445
6446// TODO: we could return a pair of values that specify the max VF and
6447// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6448// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6449// doesn't have a cost model that can choose which plan to execute if
6450// more than one is generated.
6451static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6452 LoopVectorizationCostModel &CM) {
6453 unsigned WidestType;
6454 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6455 return WidestVectorRegBits / WidestType;
6456}
6457
6458VectorizationFactor
6459LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6460 unsigned VF = UserVF;
6461 // Outer loop handling: outer loops may require CFG and instruction-level
6462 // transformations before we can even evaluate whether vectorization is profitable.
6463 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6464 // the vectorization pipeline.
6465 if (!OrigLoop->empty()) {
6466 // If the user doesn't provide a vectorization factor, determine a
6467 // reasonable one.
6468 if (!UserVF) {
6469 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6470 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan computed VF "
<< VF << ".\n"; } } while (false)
;
6471
6472 // Make sure we have a VF > 1 for stress testing.
6473 if (VPlanBuildStressTest && VF < 2) {
6474 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan stress testing: "
<< "overriding computed VF.\n"; } } while (false)
6475 << "overriding computed VF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan stress testing: "
<< "overriding computed VF.\n"; } } while (false)
;
6476 VF = 4;
6477 }
6478 }
6479 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.")((EnableVPlanNativePath && "VPlan-native path is not enabled."
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is not enabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6479, __PRETTY_FUNCTION__))
;
6480 assert(isPowerOf2_32(VF) && "VF needs to be a power of two")((isPowerOf2_32(VF) && "VF needs to be a power of two"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6480, __PRETTY_FUNCTION__))
;
6481 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using " << (
UserVF ? "user " : "") << "VF " << VF << " to build VPlans.\n"
; } } while (false)
6482 << " to build VPlans.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using " << (
UserVF ? "user " : "") << "VF " << VF << " to build VPlans.\n"
; } } while (false)
;
6483 buildVPlans(VF, VF);
6484
6485 // For VPlan build stress testing, we bail out after VPlan construction.
6486 if (VPlanBuildStressTest)
6487 return VectorizationFactor::Disabled();
6488
6489 return {VF, 0};
6490 }
6491
6492 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
"VPlan-native path.\n"; } } while (false)
6493 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
"VPlan-native path.\n"; } } while (false)
6494 "VPlan-native path.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
"VPlan-native path.\n"; } } while (false)
;
6495 return VectorizationFactor::Disabled();
6496}
6497
6498Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6499 assert(OrigLoop->empty() && "Inner loop expected.")((OrigLoop->empty() && "Inner loop expected.") ? static_cast
<void> (0) : __assert_fail ("OrigLoop->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6499, __PRETTY_FUNCTION__))
;
6500 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6501 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6502 return None;
6503
6504 // Invalidate interleave groups if all blocks of loop will be predicated.
6505 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6506 !useMaskedInterleavedAccesses(*TTI)) {
6507 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n"; } } while (false
)
6508 dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n"; } } while (false
)
6509 << "LV: Invalidate all interleaved groups due to fold-tail by masking "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n"; } } while (false
)
6510 "which requires masked-interleaved support.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n"; } } while (false
)
;
6511 CM.InterleaveInfo.reset();
6512 }
6513
6514 if (UserVF) {
6515 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using user VF " <<
UserVF << ".\n"; } } while (false)
;
6516 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")((isPowerOf2_32(UserVF) && "VF needs to be a power of two"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6516, __PRETTY_FUNCTION__))
;
6517 // Collect the instructions (and their associated costs) that will be more
6518 // profitable to scalarize.
6519 CM.selectUserVectorizationFactor(UserVF);
6520 buildVPlansWithVPRecipes(UserVF, UserVF);
6521 LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false)
;
6522 return {{UserVF, 0}};
6523 }
6524
6525 unsigned MaxVF = MaybeMaxVF.getValue();
6526 assert(MaxVF != 0 && "MaxVF is zero.")((MaxVF != 0 && "MaxVF is zero.") ? static_cast<void
> (0) : __assert_fail ("MaxVF != 0 && \"MaxVF is zero.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6526, __PRETTY_FUNCTION__))
;
6527
6528 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6529 // Collect Uniform and Scalar instructions after vectorization with VF.
6530 CM.collectUniformsAndScalars(VF);
6531
6532 // Collect the instructions (and their associated costs) that will be more
6533 // profitable to scalarize.
6534 if (VF > 1)
6535 CM.collectInstsToScalarize(VF);
6536 }
6537
6538 buildVPlansWithVPRecipes(1, MaxVF);
6539 LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false)
;
6540 if (MaxVF == 1)
6541 return VectorizationFactor::Disabled();
6542
6543 // Select the optimal vectorization factor.
6544 return CM.selectVectorizationFactor(MaxVF);
6545}
6546
6547void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6548 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
<< VF << ", UF=" << UF << '\n'; } } while
(false)
6549 << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
<< VF << ", UF=" << UF << '\n'; } } while
(false)
;
6550 BestVF = VF;
6551 BestUF = UF;
6552
6553 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6554 return !Plan->hasVF(VF);
6555 });
6556 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.")((VPlans.size() == 1 && "Best VF has not a single VPlan."
) ? static_cast<void> (0) : __assert_fail ("VPlans.size() == 1 && \"Best VF has not a single VPlan.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6556, __PRETTY_FUNCTION__))
;
6557}
6558
6559void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6560 DominatorTree *DT) {
6561 // Perform the actual loop transformation.
6562
6563 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6564 VPCallbackILV CallbackILV(ILV);
6565
6566 VPTransformState State{BestVF, BestUF, LI,
6567 DT, ILV.Builder, ILV.VectorLoopValueMap,
6568 &ILV, CallbackILV};
6569 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6570 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6571
6572 //===------------------------------------------------===//
6573 //
6574 // Notice: any optimization or new instruction that goes
6575 // into the code below should also be implemented in
6576 // the cost-model.
6577 //
6578 //===------------------------------------------------===//
6579
6580 // 2. Copy and widen instructions from the old loop into the new loop.
6581 assert(VPlans.size() == 1 && "Not a single VPlan to execute.")((VPlans.size() == 1 && "Not a single VPlan to execute."
) ? static_cast<void> (0) : __assert_fail ("VPlans.size() == 1 && \"Not a single VPlan to execute.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6581, __PRETTY_FUNCTION__))
;
6582 VPlans.front()->execute(&State);
6583
6584 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6585 // predication, updating analyses.
6586 ILV.fixVectorizedLoop();
6587}
6588
6589void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6590 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6591 BasicBlock *Latch = OrigLoop->getLoopLatch();
6592
6593 // We create new control-flow for the vectorized loop, so the original
6594 // condition will be dead after vectorization if it's only used by the
6595 // branch.
6596 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6597 if (Cmp && Cmp->hasOneUse())
6598 DeadInstructions.insert(Cmp);
6599
6600 // We create new "steps" for induction variable updates to which the original
6601 // induction variables map. An original update instruction will be dead if
6602 // all its users except the induction variable are dead.
6603 for (auto &Induction : Legal->getInductionVars()) {
6604 PHINode *Ind = Induction.first;
6605 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6606 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6607 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6608 DeadInstructions.end();
6609 }))
6610 DeadInstructions.insert(IndUpdate);
6611
6612 // We record as "Dead" also the type-casting instructions we had identified
6613 // during induction analysis. We don't need any handling for them in the
6614 // vectorized loop because we have proven that, under a proper runtime
6615 // test guarding the vectorized loop, the value of the phi, and the casted
6616 // value of the phi, are the same. The last instruction in this casting chain
6617 // will get its scalar/vector/widened def from the scalar/vector/widened def
6618 // of the respective phi node. Any other casts in the induction def-use chain
6619 // have no other uses outside the phi update chain, and will be ignored.
6620 InductionDescriptor &IndDes = Induction.second;
6621 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6622 DeadInstructions.insert(Casts.begin(), Casts.end());
6623 }
6624}
6625
6626Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6627
6628Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6629
6630Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6631 Instruction::BinaryOps BinOp) {
6632 // When unrolling and the VF is 1, we only need to add a simple scalar.
6633 Type *Ty = Val->getType();
6634 assert(!Ty->isVectorTy() && "Val must be a scalar")((!Ty->isVectorTy() && "Val must be a scalar") ? static_cast
<void> (0) : __assert_fail ("!Ty->isVectorTy() && \"Val must be a scalar\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6634, __PRETTY_FUNCTION__))
;
6635
6636 if (Ty->isFloatingPointTy()) {
6637 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6638
6639 // Floating point operations had to be 'fast' to enable the unrolling.
6640 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6641 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6642 }
6643 Constant *C = ConstantInt::get(Ty, StartIdx);
6644 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6645}
6646
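For the unroller (VF == 1) the step vector above degenerates to plain scalar arithmetic: unrolled copy Part of the induction gets Val + Part * Step. A small integer-only illustration (the floating-point path additionally relies on fast-math flags, which this sketch ignores):

#include <cstdio>

static long long scalarStep(long long Val, int StartIdx, long long Step) {
  return Val + StartIdx * Step;
}

int main() {
  // e.g. an induction currently at 7 with step 3, unrolled by 4:
  for (int Part = 0; Part < 4; ++Part)
    std::printf("%lld ", scalarStep(7, Part, 3)); // prints: 7 10 13 16
  std::printf("\n");
  return 0;
}
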
6647static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6648 SmallVector<Metadata *, 4> MDs;
6649 // Reserve first location for self reference to the LoopID metadata node.
6650 MDs.push_back(nullptr);
6651 bool IsUnrollMetadata = false;
6652 MDNode *LoopID = L->getLoopID();
6653 if (LoopID) {
6654 // First find existing loop unrolling disable metadata.
6655 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6656 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6657 if (MD) {
6658 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6659 IsUnrollMetadata =
6660 S && S->getString().startswith("llvm.loop.unroll.disable");
6661 }
6662 MDs.push_back(LoopID->getOperand(i));
6663 }
6664 }
6665
6666 if (!IsUnrollMetadata) {
6667 // Add runtime unroll disable metadata.
6668 LLVMContext &Context = L->getHeader()->getContext();
6669 SmallVector<Metadata *, 1> DisableOperands;
6670 DisableOperands.push_back(
6671 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6672 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6673 MDs.push_back(DisableNode);
6674 MDNode *NewLoopID = MDNode::get(Context, MDs);
6675 // Set operand 0 to refer to the loop id itself.
6676 NewLoopID->replaceOperandWith(0, NewLoopID);
6677 L->setLoopID(NewLoopID);
6678 }
6679}
6680
6681bool LoopVectorizationPlanner::getDecisionAndClampRange(
6682 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6683 assert(Range.End > Range.Start && "Trying to test an empty VF range.")((Range.End > Range.Start && "Trying to test an empty VF range."
) ? static_cast<void> (0) : __assert_fail ("Range.End > Range.Start && \"Trying to test an empty VF range.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6683, __PRETTY_FUNCTION__))
;
6684 bool PredicateAtRangeStart = Predicate(Range.Start);
6685
6686 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6687 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6688 Range.End = TmpVF;
6689 break;
6690 }
6691
6692 return PredicateAtRangeStart;
6693}
6694
6695/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6696/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6697/// of VF's starting at a given VF and extending it as much as possible. Each
6698/// vectorization decision can potentially shorten this sub-range during
6699/// buildVPlan().
6700void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6701 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6702 VFRange SubRange = {VF, MaxVF + 1};
6703 VPlans.push_back(buildVPlan(SubRange));
6704 VF = SubRange.End;
6705 }
6706}
6707
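A standalone sketch of how getDecisionAndClampRange and the buildVPlans loop above interact; widensAt is an invented stand-in for a single vectorization decision that only holds for VF >= 4, so the full range {1..8} splits into two VPlans:

#include <cstdio>

struct VFRange { unsigned Start, End; }; // End is exclusive.

// Simplified model of getDecisionAndClampRange: shrink Range.End at the first
// power-of-two VF where the predicate disagrees with its value at Range.Start.
static bool decideAndClamp(bool (*Predicate)(unsigned), VFRange &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF;
      break;
    }
  return AtStart;
}

// Assumed example decision: this instruction only widens profitably for VF >= 4.
static bool widensAt(unsigned VF) { return VF >= 4; }

int main() {
  const unsigned MinVF = 1, MaxVF = 8;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    decideAndClamp(widensAt, SubRange); // one VPlan covers [Start, End)
    std::printf("VPlan for VFs [%u, %u)\n", SubRange.Start, SubRange.End);
    VF = SubRange.End;
  }
  // Prints: VPlan for VFs [1, 4)  then  VPlan for VFs [4, 9)
  return 0;
}
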
6708VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6709 VPlanPtr &Plan) {
6710 assert(is_contained(predecessors(Dst), Src) && "Invalid edge")((is_contained(predecessors(Dst), Src) && "Invalid edge"
) ? static_cast<void> (0) : __assert_fail ("is_contained(predecessors(Dst), Src) && \"Invalid edge\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6710, __PRETTY_FUNCTION__))
;
6711
6712 // Look for cached value.
6713 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6714 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6715 if (ECEntryIt != EdgeMaskCache.end())
6716 return ECEntryIt->second;
6717
6718 VPValue *SrcMask = createBlockInMask(Src, Plan);
6719
6720 // The terminator has to be a branch inst!
6721 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6722 assert(BI && "Unexpected terminator found")((BI && "Unexpected terminator found") ? static_cast<
void> (0) : __assert_fail ("BI && \"Unexpected terminator found\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6722, __PRETTY_FUNCTION__))
;
6723
6724 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6725 return EdgeMaskCache[Edge] = SrcMask;
6726
6727 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6728 assert(EdgeMask && "No Edge Mask found for condition")((EdgeMask && "No Edge Mask found for condition") ? static_cast
<void> (0) : __assert_fail ("EdgeMask && \"No Edge Mask found for condition\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6728, __PRETTY_FUNCTION__))
;
6729
6730 if (BI->getSuccessor(0) != Dst)
6731 EdgeMask = Builder.createNot(EdgeMask);
6732
6733 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6734 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6735
6736 return EdgeMaskCache[Edge] = EdgeMask;
6737}
6738
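A per-lane boolean model of the edge mask built above: the branch condition is negated when Dst is the false successor and then ANDed with the source block's mask, where a null ("all-one") mask is modelled here as true; the names are illustrative only:

#include <cstdio>

static bool edgeMaskLane(bool SrcMaskLane, bool CondLane, bool DstIsTrueSuccessor) {
  bool EdgeLane = DstIsTrueSuccessor ? CondLane : !CondLane;
  return EdgeLane && SrcMaskLane; // AND with the source block's mask
}

int main() {
  // A lane where the source block is active and the condition is false:
  // only the edge to the false successor stays active.
  std::printf("%d %d\n",
              edgeMaskLane(true, false, /*true successor*/ true),
              edgeMaskLane(true, false, /*false successor*/ false)); // 0 1
  return 0;
}
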
6739VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6740 assert(OrigLoop->contains(BB) && "Block is not a part of a loop")((OrigLoop->contains(BB) && "Block is not a part of a loop"
) ? static_cast<void> (0) : __assert_fail ("OrigLoop->contains(BB) && \"Block is not a part of a loop\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6740, __PRETTY_FUNCTION__))
;
6
Assuming the condition is true
7
'?' condition is true
6741
6742 // Look for cached value.
6743 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6744 if (BCEntryIt != BlockMaskCache.end())
8
Assuming the condition is false
9
Taking false branch
6745 return BCEntryIt->second;
6746
6747 // All-one mask is modelled as no-mask following the convention for masked
6748 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6749 VPValue *BlockMask = nullptr;
6750
6751 if (OrigLoop->getHeader() == BB) {
10
Assuming the condition is false
11
Taking false branch
6752 if (!CM.blockNeedsPredication(BB))
6753 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6754
6755 // Introduce the early-exit compare IV <= BTC to form header block mask.
6756 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6757 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6758 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6759 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6760 return BlockMaskCache[BB] = BlockMask;
6761 }
6762
6763 // This is the block mask. We OR all incoming edges.
6764 for (auto *Predecessor : predecessors(BB)) {
6765 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6766 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
12
Assuming 'EdgeMask' is non-null
13
Taking false branch
16
Assuming 'EdgeMask' is non-null
17
Taking false branch
28
Assuming 'EdgeMask' is null
29
Taking true branch
6767 return BlockMaskCache[BB] = EdgeMask;
30
Potential leak of memory pointed to by 'BlockMask'
6768
6769 if (!BlockMask
13.1
'BlockMask' is null
17.1
'BlockMask' is non-null
) { // BlockMask has its initialized nullptr value.
14
Taking true branch
18
Taking false branch
6770 BlockMask = EdgeMask;
6771 continue;
15
Execution continues on line 6764
6772 }
6773
6774 BlockMask = Builder.createOr(BlockMask, EdgeMask);
19
Calling 'VPBuilder::createOr'
27
Returned allocated memory
6775 }
6776
6777 return BlockMaskCache[BB] = BlockMask;
6778}
6779
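The diagnostic this report is about ends at the early return above (source line 6767): on the analyzer's path, one loop iteration stores the result of Builder.createOr into BlockMask (step 27, "Returned allocated memory"), and a later iteration meets a predecessor whose edge mask is all-one (null), so the function returns that null mask and the pointer held in BlockMask is never cached or returned. A minimal standalone sketch of that pattern, using made-up types and helpers (MaskVal, makeOr, blockInMask) rather than the VPlan classes:

#include <cstdio>
#include <map>
#include <vector>

struct MaskVal { int Id; };

static MaskVal *makeOr(MaskVal *A, MaskVal *B) {
  (void)A; (void)B;
  return new MaskVal{0}; // corresponds to step 27: "Returned allocated memory"
}

static std::map<int, MaskVal *> MaskCache;

static MaskVal *blockInMask(int BB, const std::vector<MaskVal *> &EdgeMasks) {
  MaskVal *BlockMask = nullptr;
  for (MaskVal *EdgeMask : EdgeMasks) {
    if (!EdgeMask)                     // all-one predecessor mask
      return MaskCache[BB] = EdgeMask; // the early return: BlockMask is dropped here
    if (!BlockMask) {
      BlockMask = EdgeMask;
      continue;
    }
    BlockMask = makeOr(BlockMask, EdgeMask); // fresh allocation held only in BlockMask
  }
  return MaskCache[BB] = BlockMask;
}

int main() {
  MaskVal A{1}, B{2};
  // Two concrete edge masks followed by an all-one (null) one: the MaskVal
  // allocated by makeOr is never stored anywhere before the early return.
  blockInMask(0, {&A, &B, nullptr});
  std::printf("cached mask: %p\n", (void *)MaskCache[0]); // prints a null pointer
  return 0;
}

Whether memory is really lost in LoopVectorize.cpp depends on whether VPBuilder::createOr also links the new VPInstruction into a VPBasicBlock that owns it; the checker only tracks the raw pointer, so if such ownership exists the report is a false positive, and otherwise caching or deleting the partially built mask before the early return would address it.
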
6780VPWidenMemoryInstructionRecipe *
6781VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6782 VPlanPtr &Plan) {
6783 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
1
Assuming 'I' is a 'LoadInst'
6784 return nullptr;
6785
6786 auto willWiden = [&](unsigned VF) -> bool {
6787 if (VF == 1)
6788 return false;
6789 LoopVectorizationCostModel::InstWidening Decision =
6790 CM.getWideningDecision(I, VF);
6791 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&((Decision != LoopVectorizationCostModel::CM_Unknown &&
"CM decision should be taken at this point.") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6792, __PRETTY_FUNCTION__))
6792 "CM decision should be taken at this point.")((Decision != LoopVectorizationCostModel::CM_Unknown &&
"CM decision should be taken at this point.") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6792, __PRETTY_FUNCTION__))
;
6793 if (Decision == LoopVectorizationCostModel::CM_Interleave)
6794 return true;
6795 if (CM.isScalarAfterVectorization(I, VF) ||
6796 CM.isProfitableToScalarize(I, VF))
6797 return false;
6798 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6799 };
6800
6801 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
2
Assuming the condition is false
3
Taking false branch
6802 return nullptr;
6803
6804 VPValue *Mask = nullptr;
6805 if (Legal->isMaskRequired(I))
4
Taking true branch
6806 Mask = createBlockInMask(I->getParent(), Plan);
5
Calling 'VPRecipeBuilder::createBlockInMask'
6807
6808 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6809 return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6810}
6811
6812VPWidenIntOrFpInductionRecipe *
6813VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6814 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6815 // Check if this is an integer or fp induction. If so, build the recipe that
6816 // produces its scalar and vector values.
6817 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6818 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6819 II.getKind() == InductionDescriptor::IK_FpInduction)
6820 return new VPWidenIntOrFpInductionRecipe(Phi);
6821
6822 return nullptr;
6823 }
6824
6825 // Optimize the special case where the source is a constant integer
6826 // induction variable. Notice that we can only optimize the 'trunc' case
6827 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6828 // (c) other casts depend on pointer size.
6829
6830 // Determine whether \p K is a truncation based on an induction variable that
6831 // can be optimized.
6832 auto isOptimizableIVTruncate =
6833 [&](Instruction *K) -> std::function<bool(unsigned)> {
6834 return
6835 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6836 };
6837
6838 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6839 isOptimizableIVTruncate(I), Range))
6840 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6841 cast<TruncInst>(I));
6842 return nullptr;
6843}
6844
6845VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6846 PHINode *Phi = dyn_cast<PHINode>(I);
6847 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6848 return nullptr;
6849
6850 // We know that all PHIs in non-header blocks are converted into selects, so
6851 // we don't have to worry about the insertion order and we can just use the
6852 // builder. At this point we generate the predication tree. There may be
6853 // duplications since this is a simple recursive scan, but future
6854 // optimizations will clean it up.
6855
6856 SmallVector<VPValue *, 2> Masks;
6857 unsigned NumIncoming = Phi->getNumIncomingValues();
6858 for (unsigned In = 0; In < NumIncoming; In++) {
6859 VPValue *EdgeMask =
6860 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6861 assert((EdgeMask || NumIncoming == 1) &&(((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6862, __PRETTY_FUNCTION__))
6862 "Multiple predecessors with one having a full mask")(((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6862, __PRETTY_FUNCTION__))
;
6863 if (EdgeMask)
6864 Masks.push_back(EdgeMask);
6865 }
6866 return new VPBlendRecipe(Phi, Masks);
6867}
6868
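A per-lane model of the blend built above: a phi with N incoming values in a non-header block becomes N - 1 selects driven by the incoming edges' masks (here one boolean per incoming edge for a single lane; exactly one is expected to be true):

#include <cstdio>
#include <vector>

static int blendLane(const std::vector<int> &Incoming, const std::vector<bool> &Mask) {
  int Result = Incoming[0];
  for (size_t In = 1; In < Incoming.size(); ++In)
    Result = Mask[In] ? Incoming[In] : Result; // one select per extra incoming value
  return Result;
}

int main() {
  // Three incoming values; for this lane only the third edge is active.
  std::printf("%d\n", blendLane({10, 20, 30}, {false, false, true})); // 30
  return 0;
}
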
6869bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6870 VFRange &Range) {
6871
6872 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6873 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6874
6875 if (IsPredicated)
6876 return false;
6877
6878 auto IsVectorizableOpcode = [](unsigned Opcode) {
6879 switch (Opcode) {
6880 case Instruction::Add:
6881 case Instruction::And:
6882 case Instruction::AShr:
6883 case Instruction::BitCast:
6884 case Instruction::Br:
6885 case Instruction::Call:
6886 case Instruction::FAdd:
6887 case Instruction::FCmp:
6888 case Instruction::FDiv:
6889 case Instruction::FMul:
6890 case Instruction::FNeg:
6891 case Instruction::FPExt:
6892 case Instruction::FPToSI:
6893 case Instruction::FPToUI:
6894 case Instruction::FPTrunc:
6895 case Instruction::FRem:
6896 case Instruction::FSub:
6897 case Instruction::ICmp:
6898 case Instruction::IntToPtr:
6899 case Instruction::Load:
6900 case Instruction::LShr:
6901 case Instruction::Mul:
6902 case Instruction::Or:
6903 case Instruction::PHI:
6904 case Instruction::PtrToInt:
6905 case Instruction::SDiv:
6906 case Instruction::Select:
6907 case Instruction::SExt:
6908 case Instruction::Shl:
6909 case Instruction::SIToFP:
6910 case Instruction::SRem:
6911 case Instruction::Store:
6912 case Instruction::Sub:
6913 case Instruction::Trunc:
6914 case Instruction::UDiv:
6915 case Instruction::UIToFP:
6916 case Instruction::URem:
6917 case Instruction::Xor:
6918 case Instruction::ZExt:
6919 return true;
6920 }
6921 return false;
6922 };
6923
6924 if (!IsVectorizableOpcode(I->getOpcode()))
6925 return false;
6926
6927 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6928 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6929 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6930 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6931 return false;
6932 }
6933
6934 auto willWiden = [&](unsigned VF) -> bool {
6935 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6936 CM.isProfitableToScalarize(I, VF)))
6937 return false;
6938 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6939 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6940 // The following case may be scalarized depending on the VF.
6941 // The flag shows whether we use Intrinsic or a usual Call for vectorized
6942 // version of the instruction.
6943 // Is it beneficial to perform intrinsic call compared to lib call?
6944 bool NeedToScalarize;
6945 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6946 bool UseVectorIntrinsic =
6947 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6948 return UseVectorIntrinsic || !NeedToScalarize;
6949 }
6950 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6951       assert(CM.getWideningDecision(I, VF) ==
6952                  LoopVectorizationCostModel::CM_Scalarize &&
6953              "Memory widening decisions should have been taken care by now");
6954 return false;
6955 }
6956 return true;
6957 };
6958
6959 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6960 return false;
6961 // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6962 // to avoid having to split recipes later.
6963 bool IsSingleton = Ingredient2Recipe.count(I);
6964
6965 // Success: widen this instruction.
6966
6967 // Use the default widening recipe. We optimize the common case where
6968 // consecutive instructions can be represented by a single recipe.
6969 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6970 LastExtensibleRecipe->appendInstruction(I))
6971 return true;
6972
6973 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6974 if (!IsSingleton)
6975 LastExtensibleRecipe = WidenRecipe;
6976 setRecipe(I, WidenRecipe);
6977 VPBB->appendRecipe(WidenRecipe);
6978 return true;
6979}
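// When tryToWiden declines an instruction (predicated scalar, an opcode not
// listed above, an ignored intrinsic, or a VF range where widening is not
// worthwhile), buildVPlanWithVPRecipes falls back to handleReplication below,
// which emits a VPReplicateRecipe that scalarizes the instruction per lane.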
6980
6981VPBasicBlock *VPRecipeBuilder::handleReplication(
6982 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6983 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6984 VPlanPtr &Plan) {
6985 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6986 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6987 Range);
6988
6989 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6990 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6991
6992 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6993 setRecipe(I, Recipe);
6994
6995 // Find if I uses a predicated instruction. If so, it will use its scalar
6996 // value. Avoid hoisting the insert-element which packs the scalar value into
6997 // a vector value, as that happens iff all users use the vector value.
6998 for (auto &Op : I->operands())
6999 if (auto *PredInst = dyn_cast<Instruction>(Op))
7000 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7001 PredInst2Recipe[PredInst]->setAlsoPack(false);
7002
7003 // Finalize the recipe for Instr, first if it is not predicated.
7004 if (!IsPredicated) {
7005     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7006 VPBB->appendRecipe(Recipe);
7007 return VPBB;
7008 }
7009   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7010   assert(VPBB->getSuccessors().empty() &&
7011          "VPBB has successors when handling predicated replication.");
7012 // Record predicated instructions for above packing optimizations.
7013 PredInst2Recipe[I] = Recipe;
7014 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7015 VPBlockUtils::insertBlockAfter(Region, VPBB);
7016 auto *RegSucc = new VPBasicBlock();
7017 VPBlockUtils::insertBlockAfter(RegSucc, Region);
7018 return RegSucc;
7019}
7020
7021VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7022 VPRecipeBase *PredRecipe,
7023 VPlanPtr &Plan) {
7024 // Instructions marked for predication are replicated and placed under an
7025 // if-then construct to prevent side-effects.
7026
7027 // Generate recipes to compute the block mask for this region.
7028 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7029
7030 // Build the triangular if-then region.
7031 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7032   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7033 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7034 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7035 auto *PHIRecipe =
7036 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7037 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7038 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7039 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7040
7041 // Note: first set Entry as region entry and then connect successors starting
7042 // from it in order, to propagate the "parent" of each VPBasicBlock.
7043 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7044 VPBlockUtils::connectBlocks(Pred, Exit);
7045
7046 return Region;
7047}
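// Rough shape of the region built above, assuming the predicated instruction
// is an "add" (the name is illustrative only):
//
//        pred.add.entry      (BranchOnMask on BlockInMask)
//          /           \
//   pred.add.if         |
//   (replicated instr)  |
//          \           /
//        pred.add.continue   (optional VPPredInstPHIRecipe)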
7048
7049bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7050 VPlanPtr &Plan, VPBasicBlock *VPBB) {
7051 VPRecipeBase *Recipe = nullptr;
7052
7053 // First, check for specific widening recipes that deal with memory
7054 // operations, inductions and Phi nodes.
7055 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7056 (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7057 (Recipe = tryToBlend(Instr, Plan)) ||
7058 (isa<PHINode>(Instr) &&
7059 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7060 setRecipe(Instr, Recipe);
7061 VPBB->appendRecipe(Recipe);
7062 return true;
7063 }
7064
7065 // Handle GEP widening.
7066 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7067 auto Scalarize = [&](unsigned VF) {
7068 return CM.isScalarWithPredication(Instr, VF) ||
7069 CM.isScalarAfterVectorization(Instr, VF) ||
7070 CM.isProfitableToScalarize(Instr, VF);
7071 };
7072 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7073 return false;
7074 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7075 setRecipe(Instr, Recipe);
7076 VPBB->appendRecipe(Recipe);
7077 return true;
7078 }
7079
7080 // Check if Instr is to be widened by a general VPWidenRecipe, after
7081 // having first checked for specific widening recipes.
7082 if (tryToWiden(Instr, VPBB, Range))
7083 return true;
7084
7085 return false;
7086}
7087
7088void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7089 unsigned MaxVF) {
7090   assert(OrigLoop->empty() && "Inner loop expected.");
7091
7092 // Collect conditions feeding internal conditional branches; they need to be
7093 // represented in VPlan for it to model masking.
7094 SmallPtrSet<Value *, 1> NeedDef;
7095
7096 auto *Latch = OrigLoop->getLoopLatch();
7097 for (BasicBlock *BB : OrigLoop->blocks()) {
7098 if (BB == Latch)
7099 continue;
7100 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7101 if (Branch && Branch->isConditional())
7102 NeedDef.insert(Branch->getCondition());
7103 }
7104
7105 // If the tail is to be folded by masking, the primary induction variable
7106 // needs to be represented in VPlan for it to model early-exit masking.
7107 // Also, both the Phi and the live-out instruction of each reduction are
7108 // required in order to introduce a select between them in VPlan.
7109 if (CM.foldTailByMasking()) {
7110 NeedDef.insert(Legal->getPrimaryInduction());
7111 for (auto &Reduction : Legal->getReductionVars()) {
7112 NeedDef.insert(Reduction.first);
7113 NeedDef.insert(Reduction.second.getLoopExitInstr());
7114 }
7115 }
7116
7117 // Collect instructions from the original loop that will become trivially dead
7118 // in the vectorized loop. We don't need to vectorize these instructions. For
7119 // example, original induction update instructions can become dead because we
7120 // separately emit induction "steps" when generating code for the new loop.
7121 // Similarly, we create a new latch condition when setting up the structure
7122 // of the new loop, so the old one can become dead.
7123 SmallPtrSet<Instruction *, 4> DeadInstructions;
7124 collectTriviallyDeadInstructions(DeadInstructions);
7125
7126 // Add assume instructions we need to drop to DeadInstructions, to prevent
7127 // them from being added to the VPlan.
7128   // TODO: We only need to drop assumes in blocks that get flattened. If the
7129 // control flow is preserved, we should keep them.
7130 auto &ConditionalAssumes = Legal->getConditionalAssumes();
7131 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7132
7133 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7134 // Dead instructions do not need sinking. Remove them from SinkAfter.
7135 for (Instruction *I : DeadInstructions)
7136 SinkAfter.erase(I);
7137
7138 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7139 VFRange SubRange = {VF, MaxVF + 1};
7140 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7141 DeadInstructions, SinkAfter));
7142 VF = SubRange.End;
7143 }
7144}
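// In the loop above, each call to buildVPlanWithVPRecipes may shrink
// SubRange.End so that a single VPlan covers a maximal run of VFs sharing the
// same widening decisions. For example (illustrative only), with MinVF = 2 and
// MaxVF = 8 this might produce one plan for VF={2,4} and a second for VF={8}.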
7145
7146VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7147 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7148 SmallPtrSetImpl<Instruction *> &DeadInstructions,
7149 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7150
7151 // Hold a mapping from predicated instructions to their recipes, in order to
7152 // fix their AlsoPack behavior if a user is determined to replicate and use a
7153 // scalar instead of vector value.
7154 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7155
7156 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7157
7158 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7159
7160 // ---------------------------------------------------------------------------
7161 // Pre-construction: record ingredients whose recipes we'll need to further
7162 // process after constructing the initial VPlan.
7163 // ---------------------------------------------------------------------------
7164
7165 // Mark instructions we'll need to sink later and their targets as
7166 // ingredients whose recipe we'll need to record.
7167 for (auto &Entry : SinkAfter) {
7168 RecipeBuilder.recordRecipeOf(Entry.first);
7169 RecipeBuilder.recordRecipeOf(Entry.second);
7170 }
7171
7172 // For each interleave group which is relevant for this (possibly trimmed)
7173 // Range, add it to the set of groups to be later applied to the VPlan and add
7174 // placeholders for its members' Recipes which we'll be replacing with a
7175 // single VPInterleaveRecipe.
7176 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7177 auto applyIG = [IG, this](unsigned VF) -> bool {
7178 return (VF >= 2 && // Query is illegal for VF == 1
7179 CM.getWideningDecision(IG->getInsertPos(), VF) ==
7180 LoopVectorizationCostModel::CM_Interleave);
7181 };
7182 if (!getDecisionAndClampRange(applyIG, Range))
7183 continue;
7184 InterleaveGroups.insert(IG);
7185 for (unsigned i = 0; i < IG->getFactor(); i++)
7186 if (Instruction *Member = IG->getMember(i))
7187 RecipeBuilder.recordRecipeOf(Member);
7188 };
7189
7190 // ---------------------------------------------------------------------------
7191 // Build initial VPlan: Scan the body of the loop in a topological order to
7192 // visit each basic block after having visited its predecessor basic blocks.
7193 // ---------------------------------------------------------------------------
7194
7195 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7196 auto Plan = std::make_unique<VPlan>();
7197 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7198 Plan->setEntry(VPBB);
7199
7200 // Represent values that will have defs inside VPlan.
7201 for (Value *V : NeedDef)
7202 Plan->addVPValue(V);
7203
7204 // Scan the body of the loop in a topological order to visit each basic block
7205 // after having visited its predecessor basic blocks.
7206 LoopBlocksDFS DFS(OrigLoop);
7207 DFS.perform(LI);
7208
7209 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7210 // Relevant instructions from basic block BB will be grouped into VPRecipe
7211 // ingredients and fill a new VPBasicBlock.
7212 unsigned VPBBsForBB = 0;
7213 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7214 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7215 VPBB = FirstVPBBForBB;
7216 Builder.setInsertPoint(VPBB);
7217
7218 // Introduce each ingredient into VPlan.
7219 for (Instruction &I : BB->instructionsWithoutDebug()) {
7220 Instruction *Instr = &I;
7221
7222 // First filter out irrelevant instructions, to ensure no recipes are
7223 // built for them.
7224 if (isa<BranchInst>(Instr) ||
7225 DeadInstructions.find(Instr) != DeadInstructions.end())
7226 continue;
7227
7228 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7229 continue;
7230
7231 // Otherwise, if all widening options failed, Instruction is to be
7232 // replicated. This may create a successor for VPBB.
7233 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7234 Instr, Range, VPBB, PredInst2Recipe, Plan);
7235 if (NextVPBB != VPBB) {
7236 VPBB = NextVPBB;
7237 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7238 : "");
7239 }
7240 }
7241 }
7242
7243 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7244 // may also be empty, such as the last VPBB, reflecting original
7245 // basic-blocks with no recipes.
7246 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7247   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7248 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7249 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7250 delete PreEntry;
7251
7252 // ---------------------------------------------------------------------------
7253 // Transform initial VPlan: Apply previously taken decisions, in order, to
7254 // bring the VPlan to its final state.
7255 // ---------------------------------------------------------------------------
7256
7257 // Apply Sink-After legal constraints.
7258 for (auto &Entry : SinkAfter) {
7259 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7260 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7261 Sink->moveAfter(Target);
7262 }
7263
7264 // Interleave memory: for each Interleave Group we marked earlier as relevant
7265 // for this VPlan, replace the Recipes widening its memory instructions with a
7266 // single VPInterleaveRecipe at its insertion point.
7267 for (auto IG : InterleaveGroups) {
7268 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7269 RecipeBuilder.getRecipe(IG->getInsertPos()));
7270 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7271 ->insertBefore(Recipe);
7272
7273 for (unsigned i = 0; i < IG->getFactor(); ++i)
7274 if (Instruction *Member = IG->getMember(i)) {
7275 RecipeBuilder.getRecipe(Member)->eraseFromParent();
7276 }
7277 }
7278
7279 // Finally, if tail is folded by masking, introduce selects between the phi
7280 // and the live-out instruction of each reduction, at the end of the latch.
7281 if (CM.foldTailByMasking()) {
7282 Builder.setInsertPoint(VPBB);
7283 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7284 for (auto &Reduction : Legal->getReductionVars()) {
7285 VPValue *Phi = Plan->getVPValue(Reduction.first);
7286 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7287 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7288 }
7289 }
7290
7291 std::string PlanName;
7292 raw_string_ostream RSO(PlanName);
7293 unsigned VF = Range.Start;
7294 Plan->addVF(VF);
7295 RSO << "Initial VPlan for VF={" << VF;
7296 for (VF *= 2; VF < Range.End; VF *= 2) {
7297 Plan->addVF(VF);
7298 RSO << "," << VF;
7299 }
7300 RSO << "},UF>=1";
7301 RSO.flush();
7302 Plan->setName(PlanName);
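// The resulting name enumerates every VF this plan covers; for Range = [2, 8)
// it would read "Initial VPlan for VF={2,4},UF>=1".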
7303
7304 return Plan;
7305}
7306
7307VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7308 // Outer loop handling: They may require CFG and instruction level
7309 // transformations before even evaluating whether vectorization is profitable.
7310 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7311 // the vectorization pipeline.
7312   assert(!OrigLoop->empty());
7313   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7314
7315 // Create new empty VPlan
7316 auto Plan = std::make_unique<VPlan>();
7317
7318 // Build hierarchical CFG
7319 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7320 HCFGBuilder.buildHierarchicalCFG();
7321
7322 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7323 Plan->addVF(VF);
7324
7325 if (EnableVPlanPredication) {
7326 VPlanPredicator VPP(*Plan);
7327 VPP.predicate();
7328
7329 // Avoid running transformation to recipes until masked code generation in
7330 // VPlan-native path is in place.
7331 return Plan;
7332 }
7333
7334 SmallPtrSet<Instruction *, 1> DeadInstructions;
7335 VPlanTransforms::VPInstructionsToVPRecipes(
7336 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7337 return Plan;
7338}
7339
7340Value* LoopVectorizationPlanner::VPCallbackILV::
7341getOrCreateVectorValues(Value *V, unsigned Part) {
7342 return ILV.getOrCreateVectorValue(V, Part);
7343}
7344
7345Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7346 Value *V, const VPIteration &Instance) {
7347 return ILV.getOrCreateScalarValue(V, Instance);
7348}
7349
7350void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7351 VPSlotTracker &SlotTracker) const {
7352 O << " +\n"
7353 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7354 IG->getInsertPos()->printAsOperand(O, false);
7355 O << ", ";
7356 getAddr()->printAsOperand(O, SlotTracker);
7357 VPValue *Mask = getMask();
7358 if (Mask) {
7359 O << ", ";
7360 Mask->printAsOperand(O, SlotTracker);
7361 }
7362 O << "\\l\"";
7363 for (unsigned i = 0; i < IG->getFactor(); ++i)
7364 if (Instruction *I = IG->getMember(i))
7365 O << " +\n"
7366 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
7367}
7368
7369void VPWidenRecipe::execute(VPTransformState &State) {
7370 for (auto &Instr : make_range(Begin, End))
7371 State.ILV->widenInstruction(Instr);
7372}
7373
7374void VPWidenGEPRecipe::execute(VPTransformState &State) {
7375 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7376 IsIndexLoopInvariant);
7377}
7378
7379void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7380   assert(!State.Instance && "Int or FP induction being replicated.");
7381 State.ILV->widenIntOrFpInduction(IV, Trunc);
7382}
7383
7384void VPWidenPHIRecipe::execute(VPTransformState &State) {
7385 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7386}
7387
7388void VPBlendRecipe::execute(VPTransformState &State) {
7389 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7390 // We know that all PHIs in non-header blocks are converted into
7391 // selects, so we don't have to worry about the insertion order and we
7392 // can just use the builder.
7393 // At this point we generate the predication tree. There may be
7394 // duplications since this is a simple recursive scan, but future
7395 // optimizations will clean it up.
7396
7397 unsigned NumIncoming = Phi->getNumIncomingValues();
7398
7399   assert((User || NumIncoming == 1) &&
7400          "Multiple predecessors with predecessors having a full mask");
7401 // Generate a sequence of selects of the form:
7402 // SELECT(Mask3, In3,
7403 // SELECT(Mask2, In2,
7404 // ( ...)))
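  // For instance, with three incoming values the loop below produces
  //   select(Mask2, In2, select(Mask1, In1, In0))
  // where In0, the first incoming value, is used without a mask.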
7405 InnerLoopVectorizer::VectorParts Entry(State.UF);
7406 for (unsigned In = 0; In < NumIncoming; ++In) {
7407 for (unsigned Part = 0; Part < State.UF; ++Part) {
7408 // We might have single edge PHIs (blocks) - use an identity
7409 // 'select' for the first PHI operand.
7410 Value *In0 =
7411 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7412 if (In == 0)
7413 Entry[Part] = In0; // Initialize with the first incoming value.
7414 else {
7415 // Select between the current value and the previous incoming edge
7416 // based on the incoming mask.
7417 Value *Cond = State.get(User->getOperand(In), Part);
7418 Entry[Part] =
7419 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7420 }
7421 }
7422 }
7423 for (unsigned Part = 0; Part < State.UF; ++Part)
7424 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7425}
7426
7427void VPInterleaveRecipe::execute(VPTransformState &State) {
7428   assert(!State.Instance && "Interleave group being replicated.");
7429 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7430 getMask());
7431}
7432
7433void VPReplicateRecipe::execute(VPTransformState &State) {
7434 if (State.Instance) { // Generate a single instance.
7435 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7436 // Insert scalar instance packing it into a vector.
7437 if (AlsoPack && State.VF > 1) {
7438 // If we're constructing lane 0, initialize to start from undef.
7439 if (State.Instance->Lane == 0) {
7440 Value *Undef =
7441 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7442 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7443 }
7444 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7445 }
7446 return;
7447 }
7448
7449 // Generate scalar instances for all VF lanes of all UF parts, unless the
7450   // instruction is uniform, in which case generate only the first lane for each
7451 // of the UF parts.
7452 unsigned EndLane = IsUniform ? 1 : State.VF;
7453 for (unsigned Part = 0; Part < State.UF; ++Part)
7454 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7455 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7456}
7457
7458void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7459   assert(State.Instance && "Branch on Mask works only on single instance.");
7460
7461 unsigned Part = State.Instance->Part;
7462 unsigned Lane = State.Instance->Lane;
7463
7464 Value *ConditionBit = nullptr;
7465 if (!User) // Block in mask is all-one.
7466 ConditionBit = State.Builder.getTrue();
7467 else {
7468 VPValue *BlockInMask = User->getOperand(0);
7469 ConditionBit = State.get(BlockInMask, Part);
7470 if (ConditionBit->getType()->isVectorTy())
7471 ConditionBit = State.Builder.CreateExtractElement(
7472 ConditionBit, State.Builder.getInt32(Lane));
7473 }
7474
7475 // Replace the temporary unreachable terminator with a new conditional branch,
7476 // whose two destinations will be set later when they are created.
7477 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7478   assert(isa<UnreachableInst>(CurrentTerminator) &&
7479          "Expected to replace unreachable terminator with conditional branch.");
7480 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7481 CondBr->setSuccessor(0, nullptr);
7482 ReplaceInstWithInst(CurrentTerminator, CondBr);
7483}
7484
7485void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7486   assert(State.Instance && "Predicated instruction PHI works per instance.");
7487 Instruction *ScalarPredInst = cast<Instruction>(
7488 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7489 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7490 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7491   assert(PredicatingBB && "Predicated block has no single predecessor.");
7492
7493 // By current pack/unpack logic we need to generate only a single phi node: if
7494 // a vector value for the predicated instruction exists at this point it means
7495 // the instruction has vector users only, and a phi for the vector value is
7496 // needed. In this case the recipe of the predicated instruction is marked to
7497 // also do that packing, thereby "hoisting" the insert-element sequence.
7498 // Otherwise, a phi node for the scalar value is needed.
7499 unsigned Part = State.Instance->Part;
7500 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7501 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7502 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7503 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7504 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7505 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7506 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7507 } else {
7508 Type *PredInstType = PredInst->getType();
7509 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7510 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7511 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7512 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7513 }
7514}
7515
7516void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7517 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7518}
7519
7520// Determine how to lower the scalar epilogue, which depends on 1) optimising
7521// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7522// predication, and 4) a TTI hook that analyses whether the loop is suitable
7523// for predication.
7524static ScalarEpilogueLowering getScalarEpilogueLowering(
7525 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7526 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7527 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7528 LoopVectorizationLegality &LVL) {
7529 bool OptSize =
7530 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7531 PGSOQueryType::IRPass);
7532 // 1) OptSize takes precedence over all other options, i.e. if this is set,
7533 // don't look at hints or options, and don't request a scalar epilogue.
7534 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7535 return CM_ScalarEpilogueNotAllowedOptSize;
7536
7537 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7538 !PreferPredicateOverEpilog;
7539
7540 // 2) Next, if disabling predication is requested on the command line, honour
7541 // this and request a scalar epilogue. Also do this if we don't have a
7542 // primary induction variable, which is required for predication.
7543 if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7544 return CM_ScalarEpilogueAllowed;
7545
7546   // 3) and 4) Check whether predication is requested on the command line or
7547   // with a loop hint, or whether the TTI hook indicates it is profitable; if
7548   // so, request predication.
7549 if (PreferPredicateOverEpilog ||
7550 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7551 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7552 LVL.getLAI()) &&
7553 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7554 return CM_ScalarEpilogueNotNeededUsePredicate;
7555
7556 return CM_ScalarEpilogueAllowed;
7557}
7558
7559// Process the loop in the VPlan-native vectorization path. This path builds
7560// VPlan upfront in the vectorization pipeline, which allows applying
7561// VPlan-to-VPlan transformations from the very beginning without modifying the
7562// input LLVM IR.
7563static bool processLoopInVPlanNativePath(
7564 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7565 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7566 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7567 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7568 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7569
7570   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7571 Function *F = L->getHeader()->getParent();
7572 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7573
7574 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7575 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7576
7577 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7578 &Hints, IAI);
7579 // Use the planner for outer loop vectorization.
7580 // TODO: CM is not used at this point inside the planner. Turn CM into an
7581 // optional argument if we don't need it in the future.
7582 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7583
7584 // Get user vectorization factor.
7585 const unsigned UserVF = Hints.getWidth();
7586
7587 // Plan how to best vectorize, return the best VF and its cost.
7588 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7589
7590 // If we are stress testing VPlan builds, do not attempt to generate vector
7591 // code. Masked vector code generation support will follow soon.
7592 // Also, do not attempt to vectorize if no vector code will be produced.
7593 if (VPlanBuildStressTest || EnableVPlanPredication ||
7594 VectorizationFactor::Disabled() == VF)
7595 return false;
7596
7597 LVP.setBestPlan(VF.Width, 1);
7598
7599 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7600 &CM);
7601   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7602                     << L->getHeader()->getParent()->getName() << "\"\n");
7603 LVP.executePlan(LB, DT);
7604
7605 // Mark the loop as already vectorized to avoid vectorizing again.
7606 Hints.setAlreadyVectorized();
7607
7608   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7609 return true;
7610}
7611
7612bool LoopVectorizePass::processLoop(Loop *L) {
7613   assert((EnableVPlanNativePath || L->empty()) &&
7614          "VPlan-native path is not enabled. Only process inner loops.");
7615
7616#ifndef NDEBUG
7617 const std::string DebugLocStr = getDebugLocString(L);
7618#endif /* NDEBUG */
7619
7620   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7621                     << L->getHeader()->getParent()->getName() << "\" from "
7622                     << DebugLocStr << "\n");
7623
7624 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7625
7626   LLVM_DEBUG(
7627       dbgs() << "LV: Loop hints:"
7628              << " force="
7629              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7630                      ? "disabled"
7631                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7632                             ? "enabled"
7633                             : "?"))
7634              << " width=" << Hints.getWidth()
7635              << " unroll=" << Hints.getInterleave() << "\n");
7636
7637 // Function containing loop
7638 Function *F = L->getHeader()->getParent();
7639
7640 // Looking at the diagnostic output is the only way to determine if a loop
7641 // was vectorized (other than looking at the IR or machine code), so it
7642 // is important to generate an optimization remark for each loop. Most of
7643 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7644 // generated as OptimizationRemark and OptimizationRemarkMissed are
7645 // less verbose reporting vectorized loops and unvectorized loops that may
7646 // benefit from vectorization, respectively.
7647
7648 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7649     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7650 return false;
7651 }
7652
7653 PredicatedScalarEvolution PSE(*SE, *L);
7654
7655 // Check if it is legal to vectorize the loop.
7656 LoopVectorizationRequirements Requirements(*ORE);
7657 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7658 &Requirements, &Hints, DB, AC);
7659 if (!LVL.canVectorize(EnableVPlanNativePath)) {
7660     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7661 Hints.emitRemarkWithHints();
7662 return false;
7663 }
7664
7665 // Check the function attributes and profiles to find out if this function
7666 // should be optimized for size.
7667 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7668 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7669
7670 // Entrance to the VPlan-native vectorization path. Outer loops are processed
7671 // here. They may require CFG and instruction level transformations before
7672 // even evaluating whether vectorization is profitable. Since we cannot modify
7673 // the incoming IR, we need to build VPlan upfront in the vectorization
7674 // pipeline.
7675 if (!L->empty())
7676 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7677 ORE, BFI, PSI, Hints);
7678
7679   assert(L->empty() && "Inner loop expected.");
7680
7681 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7682 // count by optimizing for size, to minimize overheads.
7683 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7684 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7685     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7686                       << "This loop is worth vectorizing only if no scalar "
7687                       << "iteration overheads are incurred.");
7688 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7689       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7690 else {
7691       LLVM_DEBUG(dbgs() << "\n");
7692 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7693 }
7694 }
7695
7696 // Check the function attributes to see if implicit floats are allowed.
7697 // FIXME: This check doesn't seem possibly correct -- what if the loop is
7698 // an integer loop and the vector instructions selected are purely integer
7699 // vector instructions?
7700 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7701 reportVectorizationFailure(
7702 "Can't vectorize when the NoImplicitFloat attribute is used",
7703 "loop not vectorized due to NoImplicitFloat attribute",
7704 "NoImplicitFloat", ORE, L);
7705 Hints.emitRemarkWithHints();
7706 return false;
7707 }
7708
7709 // Check if the target supports potentially unsafe FP vectorization.
7710 // FIXME: Add a check for the type of safety issue (denormal, signaling)
7711 // for the target we're vectorizing for, to make sure none of the
7712 // additional fp-math flags can help.
7713 if (Hints.isPotentiallyUnsafe() &&
7714 TTI->isFPVectorizationPotentiallyUnsafe()) {
7715 reportVectorizationFailure(
7716 "Potentially unsafe FP op prevents vectorization",
7717 "loop not vectorized due to unsafe FP support.",
7718 "UnsafeFP", ORE, L);
7719 Hints.emitRemarkWithHints();
7720 return false;
7721 }
7722
7723 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7724 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7725
7726 // If an override option has been passed in for interleaved accesses, use it.
7727 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7728 UseInterleaved = EnableInterleavedMemAccesses;
7729
7730 // Analyze interleaved memory accesses.
7731 if (UseInterleaved) {
7732 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7733 }
7734
7735 // Use the cost model.
7736 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7737 F, &Hints, IAI);
7738 CM.collectValuesToIgnore();
7739
7740 // Use the planner for vectorization.
7741 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7742
7743 // Get user vectorization factor.
7744 unsigned UserVF = Hints.getWidth();
7745
7746 // Plan how to best vectorize, return the best VF and its cost.
7747 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7748
7749 VectorizationFactor VF = VectorizationFactor::Disabled();
7750 unsigned IC = 1;
7751 unsigned UserIC = Hints.getInterleave();
7752
7753 if (MaybeVF) {
7754 VF = *MaybeVF;
7755 // Select the interleave count.
7756 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7757 }
7758
7759 // Identify the diagnostic messages that should be produced.
7760 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7761 bool VectorizeLoop = true, InterleaveLoop = true;
7762 if (Requirements.doesNotMeet(F, L, Hints)) {
7763     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7764                          "requirements.\n");
7765 Hints.emitRemarkWithHints();
7766 return false;
7767 }
7768
7769 if (VF.Width == 1) {
7770     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7771 VecDiagMsg = std::make_pair(
7772 "VectorizationNotBeneficial",
7773 "the cost-model indicates that vectorization is not beneficial");
7774 VectorizeLoop = false;
7775 }
7776
7777 if (!MaybeVF && UserIC > 1) {
7778 // Tell the user interleaving was avoided up-front, despite being explicitly
7779 // requested.
7780     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7781                          "interleaving should be avoided up front\n");
7782 IntDiagMsg = std::make_pair(
7783 "InterleavingAvoided",
7784 "Ignoring UserIC, because interleaving was avoided up front");
7785 InterleaveLoop = false;
7786 } else if (IC == 1 && UserIC <= 1) {
7787 // Tell the user interleaving is not beneficial.
7788     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7789 IntDiagMsg = std::make_pair(
7790 "InterleavingNotBeneficial",
7791 "the cost-model indicates that interleaving is not beneficial");
7792 InterleaveLoop = false;
7793 if (UserIC == 1) {
7794 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7795 IntDiagMsg.second +=
7796 " and is explicitly disabled or interleave count is set to 1";
7797 }
7798 } else if (IC > 1 && UserIC == 1) {
7799     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7800     LLVM_DEBUG(
7801         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7802 IntDiagMsg = std::make_pair(
7803 "InterleavingBeneficialButDisabled",
7804 "the cost-model indicates that interleaving is beneficial "
7805 "but is explicitly disabled or interleave count is set to 1");
7806 InterleaveLoop = false;
7807 }
7808
7809 // Override IC if user provided an interleave count.
7810 IC = UserIC > 0 ? UserIC : IC;
7811
7812 // Emit diagnostic messages, if any.
7813 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7814 if (!VectorizeLoop && !InterleaveLoop) {
7815     // Do not vectorize or interleave the loop.
7816 ORE->emit([&]() {
7817 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7818 L->getStartLoc(), L->getHeader())
7819 << VecDiagMsg.second;
7820 });
7821 ORE->emit([&]() {
7822 return OptimizationRemarkMissed(LV_NAME"loop-vectorize", IntDiagMsg.first,
7823 L->getStartLoc(), L->getHeader())
7824 << IntDiagMsg.second;
7825 });
7826 return false;
7827 } else if (!VectorizeLoop && InterleaveLoop) {
7828     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7829 ORE->emit([&]() {
7830 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7831 L->getStartLoc(), L->getHeader())
7832 << VecDiagMsg.second;
7833 });
7834 } else if (VectorizeLoop && !InterleaveLoop) {
7835     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7836                       << ") in " << DebugLocStr << '\n');
7837 ORE->emit([&]() {
7838 return OptimizationRemarkAnalysis(LV_NAME"loop-vectorize", IntDiagMsg.first,
7839 L->getStartLoc(), L->getHeader())
7840 << IntDiagMsg.second;
7841 });
7842 } else if (VectorizeLoop && InterleaveLoop) {
7843     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7844                       << ") in " << DebugLocStr << '\n');
7845     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7846 }
7847
7848 LVP.setBestPlan(VF.Width, IC);
7849
7850 using namespace ore;
7851 bool DisableRuntimeUnroll = false;
7852 MDNode *OrigLoopID = L->getLoopID();
7853
7854 if (!VectorizeLoop) {
7855     assert(IC > 1 && "interleave count should not be 1 or 0");
7856 // If we decided that it is not legal to vectorize the loop, then
7857 // interleave it.
7858 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7859 &CM);
7860 LVP.executePlan(Unroller, DT);
7861
7862 ORE->emit([&]() {
7863 return OptimizationRemark(LV_NAME"loop-vectorize", "Interleaved", L->getStartLoc(),
7864 L->getHeader())
7865 << "interleaved loop (interleaved count: "
7866 << NV("InterleaveCount", IC) << ")";
7867 });
7868 } else {
7869 // If we decided that it is *legal* to vectorize the loop, then do it.
7870 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7871 &LVL, &CM);
7872 LVP.executePlan(LB, DT);
7873 ++LoopsVectorized;
7874
7875 // Add metadata to disable runtime unrolling a scalar loop when there are
7876 // no runtime checks about strides and memory. A scalar loop that is
7877 // rarely used is not worth unrolling.
7878 if (!LB.areSafetyChecksAdded())
7879 DisableRuntimeUnroll = true;
7880
7881 // Report the vectorization decision.
7882 ORE->emit([&]() {
7883 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7884 L->getHeader())
7885 << "vectorized loop (vectorization width: "
7886 << NV("VectorizationFactor", VF.Width)
7887 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7888 });
7889 }
7890
7891 Optional<MDNode *> RemainderLoopID =
7892 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7893 LLVMLoopVectorizeFollowupEpilogue});
7894 if (RemainderLoopID.hasValue()) {
7895 L->setLoopID(RemainderLoopID.getValue());
7896 } else {
7897 if (DisableRuntimeUnroll)
7898 AddRuntimeUnrollDisableMetaData(L);
7899
7900 // Mark the loop as already vectorized to avoid vectorizing again.
7901 Hints.setAlreadyVectorized();
7902 }
7903
7904 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7905 return true;
7906}
7907
7908bool LoopVectorizePass::runImpl(
7909 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7910 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7911 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7912 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7913 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7914 SE = &SE_;
7915 LI = &LI_;
7916 TTI = &TTI_;
7917 DT = &DT_;
7918 BFI = &BFI_;
7919 TLI = TLI_;
7920 AA = &AA_;
7921 AC = &AC_;
7922 GetLAA = &GetLAA_;
7923 DB = &DB_;
7924 ORE = &ORE_;
7925 PSI = PSI_;
7926
7927 // Don't attempt if
7928 // 1. the target claims to have no vector registers, and
7929 // 2. interleaving won't help ILP.
7930 //
7931 // The second condition is necessary because, even if the target has no
7932 // vector registers, loop vectorization may still enable scalar
7933 // interleaving.
7934 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7935 TTI->getMaxInterleaveFactor(1) < 2)
7936 return false;
7937
7938 bool Changed = false;
7939
7940 // The vectorizer requires loops to be in simplified form.
7941 // Since simplification may add new inner loops, it has to run before the
7942 // legality and profitability checks. This means running the loop vectorizer
7943 // will simplify all loops, regardless of whether anything ends up being
7944 // vectorized.
7945 for (auto &L : *LI)
7946 Changed |=
7947 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7948
7949 // Build up a worklist of inner-loops to vectorize. This is necessary as
7950 // the act of vectorizing or partially unrolling a loop creates new loops
7951 // and can invalidate iterators across the loops.
7952 SmallVector<Loop *, 8> Worklist;
7953
7954 for (Loop *L : *LI)
7955 collectSupportedLoops(*L, LI, ORE, Worklist);
7956
7957 LoopsAnalyzed += Worklist.size();
7958
7959 // Now walk the identified inner loops.
7960 while (!Worklist.empty()) {
7961 Loop *L = Worklist.pop_back_val();
7962
7963 // For the inner loops we actually process, form LCSSA to simplify the
7964 // transform.
7965 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7966
7967 Changed |= processLoop(L);
7968 }
7969
7970 // Process each loop nest in the function.
7971 return Changed;
7972}
7973
7974PreservedAnalyses LoopVectorizePass::run(Function &F,
7975 FunctionAnalysisManager &AM) {
7976 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7977 auto &LI = AM.getResult<LoopAnalysis>(F);
7978 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7979 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7980 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7981 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7982 auto &AA = AM.getResult<AAManager>(F);
7983 auto &AC = AM.getResult<AssumptionAnalysis>(F);
7984 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7985 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7986 MemorySSA *MSSA = EnableMSSALoopDependency
7987 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7988 : nullptr;
7989
7990 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7991 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7992 [&](Loop &L) -> const LoopAccessInfo & {
7993 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7994 return LAM.getResult<LoopAccessAnalysis>(L, AR);
7995 };
7996 const ModuleAnalysisManager &MAM =
7997 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7998 ProfileSummaryInfo *PSI =
7999 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8000 bool Changed =
8001 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8002 if (!Changed)
8003 return PreservedAnalyses::all();
8004 PreservedAnalyses PA;
8005
8006 // We currently do not preserve loopinfo/dominator analyses with outer loop
8007 // vectorization. Until this is addressed, mark these analyses as preserved
8008 // only for the non-VPlan-native path.
8009 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8010 if (!EnableVPlanNativePath) {
8011 PA.preserve<LoopAnalysis>();
8012 PA.preserve<DominatorTreeAnalysis>();
8013 }
8014 PA.preserve<BasicAA>();
8015 PA.preserve<GlobalsAA>();
8016 return PA;
8017}
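
The run() entry point above only does useful work when the FunctionAnalysisManager it receives can serve every getResult<> request it makes. The following is a minimal sketch, assuming a standalone tool linked against LLVM and using only the standard PassBuilder registration calls (the function name runLoopVectorizeOn is illustrative, not part of LLVM), of how the pass is scheduled through the new pass manager:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;

// Sketch: build the analysis managers LoopVectorizePass::run expects,
// then run the pass over a single function.
static void runLoopVectorizeOn(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Register the analyses run() queries (ScalarEvolution, LoopInfo, TTI,
  // DominatorTree, BFI, TLI, AA, AssumptionCache, DemandedBits, ORE, ...).
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass());
  FPM.run(F, FAM);
}
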

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides a LoopVectorizationPlanner class.
11/// InnerLoopVectorizer vectorizes loops which contain only one basic block.
12/// LoopVectorizationPlanner drives the vectorization process after having
13/// passed Legality checks.
14/// The planner builds and optimizes the Vectorization Plans, which record the
15/// decisions on how to vectorize the given loop. In particular, they represent
16/// the control-flow of the vectorized version, the replication of instructions
17/// that are to be scalarized, and interleaved access groups.
18///
19/// Also provides a VPlan-based builder utility analogous to IRBuilder.
20/// It provides an instruction-level API for generating VPInstructions while
21/// abstracting away the Recipe manipulation details.
22//===----------------------------------------------------------------------===//
23
24#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
25#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
26
27#include "VPlan.h"
28#include "llvm/Analysis/LoopInfo.h"
29#include "llvm/Analysis/TargetLibraryInfo.h"
30#include "llvm/Analysis/TargetTransformInfo.h"
31
32namespace llvm {
33
34/// VPlan-based builder utility analogous to IRBuilder.
35class VPBuilder {
36private:
37 VPBasicBlock *BB = nullptr;
38 VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
39
40 VPInstruction *createInstruction(unsigned Opcode,
41 ArrayRef<VPValue *> Operands) {
42 VPInstruction *Instr = new VPInstruction(Opcode, Operands);
22. Memory is allocated
43 if (BB)
23. Assuming field 'BB' is null
24. Taking false branch
44 BB->insert(Instr, InsertPt);
45 return Instr;
46 }
47
48 VPInstruction *createInstruction(unsigned Opcode,
49 std::initializer_list<VPValue *> Operands) {
50 return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
21. Calling 'VPBuilder::createInstruction'
25. Returned allocated memory
51 }
52
53public:
54 VPBuilder() {}
55
56 /// Clear the insertion point: created instructions will not be inserted into
57 /// a block.
58 void clearInsertionPoint() {
59 BB = nullptr;
60 InsertPt = VPBasicBlock::iterator();
61 }
62
63 VPBasicBlock *getInsertBlock() const { return BB; }
64 VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
65
66 /// InsertPoint - A saved insertion point.
67 class VPInsertPoint {
68 VPBasicBlock *Block = nullptr;
69 VPBasicBlock::iterator Point;
70
71 public:
72 /// Creates a new insertion point which doesn't point to anything.
73 VPInsertPoint() = default;
74
75 /// Creates a new insertion point at the given location.
76 VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
77 : Block(InsertBlock), Point(InsertPoint) {}
78
79 /// Returns true if this insert point is set.
80 bool isSet() const { return Block != nullptr; }
81
82 VPBasicBlock *getBlock() const { return Block; }
83 VPBasicBlock::iterator getPoint() const { return Point; }
84 };
85
86 /// Sets the current insert point to a previously-saved location.
87 void restoreIP(VPInsertPoint IP) {
88 if (IP.isSet())
89 setInsertPoint(IP.getBlock(), IP.getPoint());
90 else
91 clearInsertionPoint();
92 }
93
94 /// This specifies that created VPInstructions should be appended to the end
95 /// of the specified block.
96 void setInsertPoint(VPBasicBlock *TheBB) {
97 assert(TheBB && "Attempting to set a null insert point");
98 BB = TheBB;
99 InsertPt = BB->end();
100 }
101
102 /// This specifies that created instructions should be inserted at the
103 /// specified point.
104 void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
105 BB = TheBB;
106 InsertPt = IP;
107 }
108
109 /// Insert and return the specified instruction.
110 VPInstruction *insert(VPInstruction *I) const {
111 BB->insert(I, InsertPt);
112 return I;
113 }
114
115 /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
116 /// its underlying Instruction.
117 VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
118 Instruction *Inst = nullptr) {
119 VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
120 NewVPInst->setUnderlyingValue(Inst);
121 return NewVPInst;
122 }
123 VPValue *createNaryOp(unsigned Opcode,
124 std::initializer_list<VPValue *> Operands,
125 Instruction *Inst = nullptr) {
126 return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
127 }
128
129 VPValue *createNot(VPValue *Operand) {
130 return createInstruction(VPInstruction::Not, {Operand});
131 }
132
133 VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
134 return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
135 }
136
137 VPValue *createOr(VPValue *LHS, VPValue *RHS) {
138 return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
20. Calling 'VPBuilder::createInstruction'
26. Returned allocated memory
139 }
140
141 //===--------------------------------------------------------------------===//
142 // RAII helpers.
143 //===--------------------------------------------------------------------===//
144
145 /// RAII object that stores the current insertion point and restores it when
146 /// the object is destroyed.
147 class InsertPointGuard {
148 VPBuilder &Builder;
149 VPBasicBlock *Block;
150 VPBasicBlock::iterator Point;
151
152 public:
153 InsertPointGuard(VPBuilder &B)
154 : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
155
156 InsertPointGuard(const InsertPointGuard &) = delete;
157 InsertPointGuard &operator=(const InsertPointGuard &) = delete;
158
159 ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
160 };
161};
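
The analyzer steps annotated in createInstruction and createOr above describe the ownership rule this builder relies on: the VPInstruction is allocated with new, and only a set insertion point (a non-null BB) hands it over to a block. Below is a self-contained analog of that pattern, using made-up Instr/Block/Builder types rather than the real VPlan classes, showing how a caller that overwrites the returned pointer while no insertion point is set produces the kind of leak the analyzer reports:

#include <vector>

struct Instr { unsigned Opcode; };

struct Block {
  std::vector<Instr *> Instrs;
  void insert(Instr *I) { Instrs.push_back(I); } // block takes ownership
  ~Block() {
    for (Instr *I : Instrs)
      delete I;
  }
};

struct Builder {
  Block *BB = nullptr; // insertion point; like VPBuilder::BB it may be null

  Instr *create(unsigned Opcode) {
    Instr *I = new Instr{Opcode}; // memory is allocated
    if (BB)                       // with BB null, nothing owns I ...
      BB->insert(I);
    return I;                     // ... except the caller's returned pointer
  }
};

int main() {
  Builder B; // no insertion point set
  Instr *Mask = B.create(/*Opcode=*/1);
  Mask = B.create(/*Opcode=*/1); // the first allocation is now unreachable:
                                 // this is the shape of the reported leak
  delete Mask;                   // only the second allocation is freed
  return 0;
}

Whether BB can actually be null at the flagged call site is exactly what the "Assuming field 'BB' is null" step leaves open; if it cannot, the report is a false positive.
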
162
163/// TODO: The following VectorizationFactor was pulled out of
164/// LoopVectorizationCostModel class. LV also deals with
165/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
166/// We need to streamline them.
167
168/// Information about vectorization costs
169struct VectorizationFactor {
170 // Vector width with best cost
171 unsigned Width;
172 // Cost of the loop with that width
173 unsigned Cost;
174
175 // Width 1 means no vectorization, cost 0 means uncomputed cost.
176 static VectorizationFactor Disabled() { return {1, 0}; }
177
178 bool operator==(const VectorizationFactor &rhs) const {
179 return Width == rhs.Width && Cost == rhs.Cost;
180 }
181};
182
183/// Planner drives the vectorization process after having passed
184/// Legality checks.
185class LoopVectorizationPlanner {
186 /// The loop that we evaluate.
187 Loop *OrigLoop;
188
189 /// Loop Info analysis.
190 LoopInfo *LI;
191
192 /// Target Library Info.
193 const TargetLibraryInfo *TLI;
194
195 /// Target Transform Info.
196 const TargetTransformInfo *TTI;
197
198 /// The legality analysis.
199 LoopVectorizationLegality *Legal;
200
201 /// The profitability analysis.
202 LoopVectorizationCostModel &CM;
203
204 /// The interleaved access analysis.
205 InterleavedAccessInfo &IAI;
206
207 SmallVector<VPlanPtr, 4> VPlans;
208
209 /// This class is used to enable the VPlan to invoke a method of ILV. This is
210 /// needed until the method is refactored out of ILV and becomes reusable.
211 struct VPCallbackILV : public VPCallback {
212 InnerLoopVectorizer &ILV;
213
214 VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
215
216 Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
217 Value *getOrCreateScalarValue(Value *V,
218 const VPIteration &Instance) override;
219 };
220
221 /// A builder used to construct the current plan.
222 VPBuilder Builder;
223
224 unsigned BestVF = 0;
225 unsigned BestUF = 0;
226
227public:
228 LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
229 const TargetTransformInfo *TTI,
230 LoopVectorizationLegality *Legal,
231 LoopVectorizationCostModel &CM,
232 InterleavedAccessInfo &IAI)
233 : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
234 IAI(IAI) {}
235
236 /// Plan how to best vectorize, return the best VF and its cost, or None if
237 /// vectorization and interleaving should be avoided up front.
238 Optional<VectorizationFactor> plan(unsigned UserVF);
239
240 /// Use the VPlan-native path to plan how to best vectorize, return the best
241 /// VF and its cost.
242 VectorizationFactor planInVPlanNativePath(unsigned UserVF);
243
244 /// Finalize the best decision and dispose of all other VPlans.
245 void setBestPlan(unsigned VF, unsigned UF);
246
247 /// Generate the IR code for the body of the vectorized loop according to the
248 /// best selected VPlan.
249 void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
250
251 void printPlans(raw_ostream &O) {
252 for (const auto &Plan : VPlans)
253 O << *Plan;
254 }
255
256 /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
257 /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
258 /// returned value holds for the entire \p Range.
259 static bool
260 getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
261 VFRange &Range);
262
263protected:
264 /// Collect the instructions from the original loop that would be trivially
265 /// dead in the vectorized loop if generated.
266 void collectTriviallyDeadInstructions(
267 SmallPtrSetImpl<Instruction *> &DeadInstructions);
268
269 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
270 /// according to the information gathered by Legal when it checked if it is
271 /// legal to vectorize the loop.
272 void buildVPlans(unsigned MinVF, unsigned MaxVF);
273
274private:
275 /// Build a VPlan according to the information gathered by Legal. \return a
276 /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
277 /// exclusive, possibly decreasing \p Range.End.
278 VPlanPtr buildVPlan(VFRange &Range);
279
280 /// Build a VPlan using VPRecipes according to the information gathered by
281 /// Legal. This method is only used for the legacy inner loop vectorizer.
282 VPlanPtr buildVPlanWithVPRecipes(
283 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
284 SmallPtrSetImpl<Instruction *> &DeadInstructions,
285 const DenseMap<Instruction *, Instruction *> &SinkAfter);
286
287 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
288 /// according to the information gathered by Legal when it checked if it is
289 /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
290 void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
291};
292
293} // namespace llvm
294
295#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
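
For orientation, the driver sequence this interface implies mirrors the setBestPlan/executePlan calls shown in LoopVectorize.cpp above. The following is a minimal sketch, assuming it lives inside LoopVectorize.cpp where InnerLoopVectorizer, LoopVectorizationLegality, LoopVectorizationCostModel and InterleavedAccessInfo are visible; planAndExecute is a hypothetical helper, not part of LLVM:

// Sketch of the plan -> setBestPlan -> executePlan order used by processLoop.
static bool planAndExecute(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                           const TargetTransformInfo *TTI,
                           LoopVectorizationLegality *LVL,
                           LoopVectorizationCostModel &CM,
                           InterleavedAccessInfo &IAI,
                           InnerLoopVectorizer &LB, DominatorTree *DT,
                           unsigned IC) {
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

  // Build and cost the candidate VPlans.
  Optional<VectorizationFactor> VF = LVP.plan(/*UserVF=*/0);
  if (!VF)
    return false; // vectorization and interleaving should both be avoided

  // Keep only the plan for the chosen width and interleave count, then emit
  // IR for it (the code above uses InnerLoopUnroller instead of
  // InnerLoopVectorizer for the interleave-only case, i.e. VF->Width == 1).
  LVP.setBestPlan(VF->Width, IC);
  LVP.executePlan(LB, DT);
  return true;
}
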