File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 2365, column 9: Value stored to 'InitVecValSTy' during its initialization is never read
1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | // and generates target-independent LLVM-IR. |
11 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | // of instructions in order to estimate the profitability of vectorization. |
13 | // |
14 | // The loop vectorizer combines consecutive loop iterations into a single |
15 | // 'wide' iteration. After this transformation the index is incremented |
16 | // by the SIMD vector width, and not by one. |
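//
// A minimal sketch of the transformation, assuming VF = 4 (the loop and all
// names below are hypothetical, for illustration only):
//
//   for (i = 0; i < n; i++)            // original scalar loop
//     a[i] = b[i] + 1;
//
// becomes, conceptually,
//
//   for (i = 0; i + 4 <= n; i += 4)    // vector body; index steps by VF
//     a[i:i+3] = b[i:i+3] + (1,1,1,1); // one wide add per iteration
//   for (; i < n; i++)                 // scalar remainder (epilogue)
//     a[i] = b[i] + 1;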
17 | // |
18 | // This pass has four parts: |
19 | // 1. The main loop pass that drives the different parts. |
20 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | // of the vectorization. |
22 | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | // widening of instructions. |
24 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | // of vectorization. It decides on the optimal vector width, which |
26 | // can be one, if vectorization is not profitable. |
27 | // |
28 | // There is a development effort going on to migrate the loop vectorizer to the |
29 | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | // docs/Proposal/VectorizationPlan.rst and |
31 | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | // alternative vectorization path that is natively implemented on top of the |
34 | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | // |
38 | // The reduction-variable vectorization is based on the paper: |
39 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | // |
41 | // Variable uniformity checks are inspired by: |
42 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | // |
44 | // The interleaved access vectorization is based on the paper: |
45 | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | // Data for SIMD |
47 | // |
48 | // Other ideas/concepts are from: |
49 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | // |
51 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | // Vectorizing Compilers. |
53 | // |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | #include "LoopVectorizationPlanner.h" |
58 | #include "VPRecipeBuilder.h" |
59 | #include "VPlan.h" |
60 | #include "VPlanHCFGBuilder.h" |
61 | #include "VPlanPredicator.h" |
62 | #include "VPlanTransforms.h" |
63 | #include "llvm/ADT/APInt.h" |
64 | #include "llvm/ADT/ArrayRef.h" |
65 | #include "llvm/ADT/DenseMap.h" |
66 | #include "llvm/ADT/DenseMapInfo.h" |
67 | #include "llvm/ADT/Hashing.h" |
68 | #include "llvm/ADT/MapVector.h" |
69 | #include "llvm/ADT/None.h" |
70 | #include "llvm/ADT/Optional.h" |
71 | #include "llvm/ADT/STLExtras.h" |
72 | #include "llvm/ADT/SmallPtrSet.h" |
73 | #include "llvm/ADT/SmallSet.h" |
74 | #include "llvm/ADT/SmallVector.h" |
75 | #include "llvm/ADT/Statistic.h" |
76 | #include "llvm/ADT/StringRef.h" |
77 | #include "llvm/ADT/Twine.h" |
78 | #include "llvm/ADT/iterator_range.h" |
79 | #include "llvm/Analysis/AssumptionCache.h" |
80 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
81 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
82 | #include "llvm/Analysis/CFG.h" |
83 | #include "llvm/Analysis/CodeMetrics.h" |
84 | #include "llvm/Analysis/DemandedBits.h" |
85 | #include "llvm/Analysis/GlobalsModRef.h" |
86 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
87 | #include "llvm/Analysis/LoopAnalysisManager.h" |
88 | #include "llvm/Analysis/LoopInfo.h" |
89 | #include "llvm/Analysis/LoopIterator.h" |
90 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
91 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
92 | #include "llvm/Analysis/ScalarEvolution.h" |
93 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
94 | #include "llvm/Analysis/TargetLibraryInfo.h" |
95 | #include "llvm/Analysis/TargetTransformInfo.h" |
96 | #include "llvm/Analysis/VectorUtils.h" |
97 | #include "llvm/IR/Attributes.h" |
98 | #include "llvm/IR/BasicBlock.h" |
99 | #include "llvm/IR/CFG.h" |
100 | #include "llvm/IR/Constant.h" |
101 | #include "llvm/IR/Constants.h" |
102 | #include "llvm/IR/DataLayout.h" |
103 | #include "llvm/IR/DebugInfoMetadata.h" |
104 | #include "llvm/IR/DebugLoc.h" |
105 | #include "llvm/IR/DerivedTypes.h" |
106 | #include "llvm/IR/DiagnosticInfo.h" |
107 | #include "llvm/IR/Dominators.h" |
108 | #include "llvm/IR/Function.h" |
109 | #include "llvm/IR/IRBuilder.h" |
110 | #include "llvm/IR/InstrTypes.h" |
111 | #include "llvm/IR/Instruction.h" |
112 | #include "llvm/IR/Instructions.h" |
113 | #include "llvm/IR/IntrinsicInst.h" |
114 | #include "llvm/IR/Intrinsics.h" |
115 | #include "llvm/IR/LLVMContext.h" |
116 | #include "llvm/IR/Metadata.h" |
117 | #include "llvm/IR/Module.h" |
118 | #include "llvm/IR/Operator.h" |
119 | #include "llvm/IR/PatternMatch.h" |
120 | #include "llvm/IR/Type.h" |
121 | #include "llvm/IR/Use.h" |
122 | #include "llvm/IR/User.h" |
123 | #include "llvm/IR/Value.h" |
124 | #include "llvm/IR/ValueHandle.h" |
125 | #include "llvm/IR/Verifier.h" |
126 | #include "llvm/InitializePasses.h" |
127 | #include "llvm/Pass.h" |
128 | #include "llvm/Support/Casting.h" |
129 | #include "llvm/Support/CommandLine.h" |
130 | #include "llvm/Support/Compiler.h" |
131 | #include "llvm/Support/Debug.h" |
132 | #include "llvm/Support/ErrorHandling.h" |
133 | #include "llvm/Support/InstructionCost.h" |
134 | #include "llvm/Support/MathExtras.h" |
135 | #include "llvm/Support/raw_ostream.h" |
136 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
137 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
138 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
139 | #include "llvm/Transforms/Utils/LoopUtils.h" |
140 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
141 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
142 | #include "llvm/Transforms/Utils/SizeOpts.h" |
143 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
144 | #include <algorithm> |
145 | #include <cassert> |
146 | #include <cstdint> |
147 | #include <cstdlib> |
148 | #include <functional> |
149 | #include <iterator> |
150 | #include <limits> |
151 | #include <memory> |
152 | #include <string> |
153 | #include <tuple> |
154 | #include <utility> |
155 | |
156 | using namespace llvm; |
157 | |
158 | #define LV_NAME "loop-vectorize" |
159 | #define DEBUG_TYPE LV_NAME |
160 | |
161 | #ifndef NDEBUG |
162 | const char VerboseDebug[] = DEBUG_TYPE "-verbose"; |
163 | #endif |
164 | |
165 | /// @{ |
166 | /// Metadata attribute names |
167 | const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; |
168 | const char LLVMLoopVectorizeFollowupVectorized[] = |
169 | "llvm.loop.vectorize.followup_vectorized"; |
170 | const char LLVMLoopVectorizeFollowupEpilogue[] = |
171 | "llvm.loop.vectorize.followup_epilogue"; |
172 | /// @} |
173 | |
174 | STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
175 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
176 | STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); |
177 | |
178 | static cl::opt<bool> EnableEpilogueVectorization( |
179 | "enable-epilogue-vectorization", cl::init(true), cl::Hidden, |
180 | cl::desc("Enable vectorization of epilogue loops.")); |
181 | |
182 | static cl::opt<unsigned> EpilogueVectorizationForceVF( |
183 | "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, |
184 | cl::desc("When epilogue vectorization is enabled, and a value greater than " |
185 | "1 is specified, forces the given VF for all applicable epilogue " |
186 | "loops.")); |
187 | |
188 | static cl::opt<unsigned> EpilogueVectorizationMinVF( |
189 | "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, |
190 | cl::desc("Only loops with vectorization factor equal to or larger than " |
191 | "the specified value are considered for epilogue vectorization.")); |
192 | |
193 | /// Loops with a known constant trip count below this number are vectorized only |
194 | /// if no scalar iteration overheads are incurred. |
195 | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
196 | "vectorizer-min-trip-count", cl::init(16), cl::Hidden, |
197 | cl::desc("Loops with a constant trip count that is smaller than this " |
198 | "value are vectorized only if no scalar iteration overheads " |
199 | "are incurred.")); |
200 | |
201 | static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( |
202 | "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, |
203 | cl::desc("The maximum allowed number of runtime memory checks with a " |
204 | "vectorize(enable) pragma.")); |
205 | |
206 | // Option prefer-predicate-over-epilogue indicates that an epilogue is |
207 | // undesired and that predication is preferred; the values below list the |
208 | // options. I.e., the vectorizer will try to fold the tail-loop (epilogue) into |
209 | // the vector body and predicate the instructions accordingly. If tail-folding |
210 | // fails, there are different fallback strategies depending on these values: |
211 | namespace PreferPredicateTy { |
212 | enum Option { |
213 | ScalarEpilogue = 0, |
214 | PredicateElseScalarEpilogue, |
215 | PredicateOrDontVectorize |
216 | }; |
217 | } // namespace PreferPredicateTy |
218 | |
219 | static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
220 | "prefer-predicate-over-epilogue", |
221 | cl::init(PreferPredicateTy::ScalarEpilogue), |
222 | cl::Hidden, |
223 | cl::desc("Tail-folding and predication preferences over creating a scalar " |
224 | "epilogue loop."), |
225 |     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
226 |                           "scalar-epilogue", |
227 |                           "Don't tail-predicate loops, create scalar epilogue"), |
228 |                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
229 |                           "predicate-else-scalar-epilogue", |
230 |                           "prefer tail-folding, create scalar epilogue if tail " |
231 |                           "folding fails."), |
232 |                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
233 |                           "predicate-dont-vectorize", |
234 |                           "prefers tail-folding, don't attempt vectorization if " |
235 |                           "tail-folding fails."))); |
236 | |
237 | static cl::opt<bool> MaximizeBandwidth( |
238 | "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, |
239 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
240 | "will be determined by the smallest type in loop.")); |
241 | |
242 | static cl::opt<bool> EnableInterleavedMemAccesses( |
243 | "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
244 | cl::desc("Enable vectorization on interleaved memory accesses in a loop")); |
245 | |
246 | /// An interleave-group may need masking if it resides in a block that needs |
247 | /// predication, or in order to mask away gaps. |
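/// For example (a hypothetical group shape): the group {A[3*i], A[3*i+1]}
/// leaves a gap at A[3*i+2]; masking the gap lane lets the group still be
/// loaded as a single wide access.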
248 | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
249 | "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, |
250 | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); |
251 | |
252 | static cl::opt<unsigned> TinyTripCountInterleaveThreshold( |
253 | "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, |
254 |     cl::desc("We don't interleave loops with an estimated constant trip count " |
255 | "below this number")); |
256 | |
257 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
258 | "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
259 | cl::desc("A flag that overrides the target's number of scalar registers.")); |
260 | |
261 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
262 | "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
263 | cl::desc("A flag that overrides the target's number of vector registers.")); |
264 | |
265 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
266 | "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
267 | cl::desc("A flag that overrides the target's max interleave factor for " |
268 | "scalar loops.")); |
269 | |
270 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
271 | "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
272 | cl::desc("A flag that overrides the target's max interleave factor for " |
273 | "vectorized loops.")); |
274 | |
275 | static cl::opt<unsigned> ForceTargetInstructionCost( |
276 | "force-target-instruction-cost", cl::init(0), cl::Hidden, |
277 | cl::desc("A flag that overrides the target's expected cost for " |
278 | "an instruction to a single constant value. Mostly " |
279 | "useful for getting consistent testing.")); |
280 | |
281 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
282 | "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, |
283 | cl::desc( |
284 | "Pretend that scalable vectors are supported, even if the target does " |
285 | "not support them. This flag should only be used for testing.")); |
286 | |
287 | static cl::opt<unsigned> SmallLoopCost( |
288 | "small-loop-cost", cl::init(20), cl::Hidden, |
289 | cl::desc( |
290 | "The cost of a loop that is considered 'small' by the interleaver.")); |
291 | |
292 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
293 | "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, |
294 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
295 | "heuristics minimizing code growth in cold regions and being more " |
296 | "aggressive in hot regions.")); |
297 | |
298 | // Runtime interleave loops for load/store throughput. |
299 | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
300 | "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, |
301 | cl::desc( |
302 | "Enable runtime interleaving until load/store ports are saturated")); |
303 | |
304 | /// Interleave small loops with scalar reductions. |
305 | static cl::opt<bool> InterleaveSmallLoopScalarReduction( |
306 | "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, |
307 | cl::desc("Enable interleaving for loops with small iteration counts that " |
308 | "contain scalar reductions to expose ILP.")); |
309 | |
310 | /// The number of stores in a loop that are allowed to need predication. |
311 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
312 | "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
313 | cl::desc("Max number of stores to be predicated behind an if.")); |
314 | |
315 | static cl::opt<bool> EnableIndVarRegisterHeur( |
316 | "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
317 | cl::desc("Count the induction variable only once when interleaving")); |
318 | |
319 | static cl::opt<bool> EnableCondStoresVectorization( |
320 | "enable-cond-stores-vec", cl::init(true), cl::Hidden, |
321 | cl::desc("Enable if predication of stores during vectorization.")); |
322 | |
323 | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
324 | "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, |
325 | cl::desc("The maximum interleave count to use when interleaving a scalar " |
326 | "reduction in a nested loop.")); |
327 | |
328 | static cl::opt<bool> |
329 | PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), |
330 | cl::Hidden, |
331 | cl::desc("Prefer in-loop vector reductions, " |
332 |                                        "overriding the target's preference.")); |
333 | |
334 | static cl::opt<bool> ForceOrderedReductions( |
335 | "force-ordered-reductions", cl::init(false), cl::Hidden, |
336 | cl::desc("Enable the vectorisation of loops with in-order (strict) " |
337 | "FP reductions")); |
338 | |
339 | static cl::opt<bool> PreferPredicatedReductionSelect( |
340 | "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, |
341 | cl::desc( |
342 | "Prefer predicating a reduction operation over an after loop select.")); |
343 | |
344 | cl::opt<bool> EnableVPlanNativePath( |
345 | "enable-vplan-native-path", cl::init(false), cl::Hidden, |
346 | cl::desc("Enable VPlan-native vectorization path with " |
347 | "support for outer loop vectorization.")); |
348 | |
349 | // FIXME: Remove this switch once we have divergence analysis. Currently we |
350 | // assume divergent non-backedge branches when this switch is true. |
351 | cl::opt<bool> EnableVPlanPredication( |
352 | "enable-vplan-predication", cl::init(false), cl::Hidden, |
353 | cl::desc("Enable VPlan-native vectorization path predicator with " |
354 | "support for outer loop vectorization.")); |
355 | |
356 | // This flag enables the stress testing of the VPlan H-CFG construction in the |
357 | // VPlan-native vectorization path. It must be used in conjunction with |
358 | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
359 | // verification of the H-CFGs built. |
360 | static cl::opt<bool> VPlanBuildStressTest( |
361 | "vplan-build-stress-test", cl::init(false), cl::Hidden, |
362 | cl::desc( |
363 | "Build VPlan for every supported loop nest in the function and bail " |
364 | "out right after the build (stress test the VPlan H-CFG construction " |
365 | "in the VPlan-native vectorization path).")); |
366 | |
367 | cl::opt<bool> llvm::EnableLoopInterleaving( |
368 | "interleave-loops", cl::init(true), cl::Hidden, |
369 | cl::desc("Enable loop interleaving in Loop vectorization passes")); |
370 | cl::opt<bool> llvm::EnableLoopVectorization( |
371 | "vectorize-loops", cl::init(true), cl::Hidden, |
372 | cl::desc("Run the Loop vectorization passes")); |
373 | |
374 | cl::opt<bool> PrintVPlansInDotFormat( |
375 | "vplan-print-in-dot-format", cl::init(false), cl::Hidden, |
376 | cl::desc("Use dot format instead of plain text when dumping VPlans")); |
377 | |
378 | /// A helper function that returns true if the given type is irregular. The |
379 | /// type is irregular if its allocated size doesn't equal the store size of an |
380 | /// element of the corresponding vector type. |
381 | static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
382 | // Determine if an array of N elements of type Ty is "bitcast compatible" |
383 | // with a <N x Ty> vector. |
384 | // This is only true if there is no padding between the array elements. |
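  // For example, x86_fp80 stores 80 bits but is allocated 96 or 128 bits
  // depending on the ABI, so it is irregular; i32, whose sizes match, is not.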
385 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
386 | } |
387 | |
388 | /// A helper function that returns the reciprocal of the block probability of |
389 | /// predicated blocks. If we return X, we are assuming the predicated block |
390 | /// will execute once for every X iterations of the loop header. |
391 | /// |
392 | /// TODO: We should use actual block probability here, if available. Currently, |
393 | /// we always assume predicated blocks have a 50% chance of executing. |
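/// For example, under the 50% assumption the cost model divides a predicated
/// block's per-instruction cost by the returned value (2) to estimate its
/// expected contribution per loop iteration.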
394 | static unsigned getReciprocalPredBlockProb() { return 2; } |
395 | |
396 | /// A helper function that returns an integer or floating-point constant with |
397 | /// value C. |
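/// For example, C = -1 yields the i32 constant -1 for an integer type and the
/// constant -1.0 for a float type.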
398 | static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { |
399 | return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) |
400 | : ConstantFP::get(Ty, C); |
401 | } |
402 | |
403 | /// Returns "best known" trip count for the specified loop \p L as defined by |
404 | /// the following procedure: |
405 | /// 1) Returns exact trip count if it is known. |
406 | /// 2) Returns expected trip count according to profile data if any. |
407 | /// 3) Returns upper bound estimate if it is known. |
408 | /// 4) Returns None if all of the above failed. |
409 | static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { |
410 | // Check if exact trip count is known. |
411 | if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
412 | return ExpectedTC; |
413 | |
414 | // Check if there is an expected trip count available from profile data. |
415 | if (LoopVectorizeWithBlockFrequency) |
416 | if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
417 | return EstimatedTC; |
418 | |
419 | // Check if upper bound estimate is known. |
420 | if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
421 | return ExpectedTC; |
422 | |
423 | return None; |
424 | } |
425 | |
426 | // Forward declare GeneratedRTChecks. |
427 | class GeneratedRTChecks; |
428 | |
429 | namespace llvm { |
430 | |
431 | AnalysisKey ShouldRunExtraVectorPasses::Key; |
432 | |
433 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
434 | /// block to a specified vectorization factor (VF). |
435 | /// This class performs the widening of scalars into vectors, or multiple |
436 | /// scalars. This class also implements the following features: |
437 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
438 | /// counts that are known to be a multiple of the vectorization factor. |
439 | /// * It handles the code generation for reduction variables. |
440 | /// * Scalarization (implementation using scalars) of un-vectorizable |
441 | /// instructions. |
442 | /// InnerLoopVectorizer does not perform any vectorization-legality |
443 | /// checks, and relies on the caller to check for the different legality |
444 | /// aspects. The InnerLoopVectorizer relies on the |
445 | /// LoopVectorizationLegality class to provide information about the induction |
446 |   /// and reduction variables that were found for a given vectorization factor. |
447 | class InnerLoopVectorizer { |
448 | public: |
449 | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
450 | LoopInfo *LI, DominatorTree *DT, |
451 | const TargetLibraryInfo *TLI, |
452 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
453 | OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
454 | unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
455 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
456 | ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
457 | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
458 | AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
459 | Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
460 | PSI(PSI), RTChecks(RTChecks) { |
461 | // Query this against the original loop and save it here because the profile |
462 | // of the original loop header may change as the transformation happens. |
463 | OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
464 | OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); |
465 | } |
466 | |
467 | virtual ~InnerLoopVectorizer() = default; |
468 | |
469 | /// Create a new empty loop that will contain vectorized instructions later |
470 | /// on, while the old loop will be used as the scalar remainder. Control flow |
471 | /// is generated around the vectorized (and scalar epilogue) loops consisting |
472 | /// of various checks and bypasses. Return the pre-header block of the new |
473 | /// loop and the start value for the canonical induction, if it is != 0. The |
474 | /// latter is the case when vectorizing the epilogue loop. In the case of |
475 |   /// epilogue vectorization, this function is overridden to handle the more |
476 | /// complex control flow around the loops. |
477 | virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); |
478 | |
479 | /// Widen a single call instruction within the innermost loop. |
480 | void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, |
481 | VPTransformState &State); |
482 | |
483 |   /// Fix the vectorized code, taking care of header phis, live-outs, and more. |
484 | void fixVectorizedLoop(VPTransformState &State); |
485 | |
486 | // Return true if any runtime check is added. |
487 | bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
488 | |
489 | /// A type for vectorized values in the new loop. Each value from the |
490 | /// original loop, when vectorized, is represented by UF vector values in the |
491 | /// new unrolled loop, where UF is the unroll factor. |
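  /// For example, with UF = 2 and VF = 4, an i32 value from the original loop
  /// is represented by two <4 x i32> vectors, one per unrolled part.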
492 | using VectorParts = SmallVector<Value *, 2>; |
493 | |
494 | /// Vectorize a single first-order recurrence or pointer induction PHINode in |
495 | /// a block. This method handles the induction variable canonicalization. It |
496 | /// supports both VF = 1 for unrolled loops and arbitrary length vectors. |
497 | void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, |
498 | VPTransformState &State); |
499 | |
500 | /// A helper function to scalarize a single Instruction in the innermost loop. |
501 | /// Generates a sequence of scalar instances for each lane between \p MinLane |
502 | /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
503 | /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p |
504 | /// Instr's operands. |
505 | void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, |
506 | const VPIteration &Instance, bool IfPredicateInstr, |
507 | VPTransformState &State); |
508 | |
509 | /// Widen an integer or floating-point induction variable \p IV. If \p Trunc |
510 | /// is provided, the integer induction variable will first be truncated to |
511 | /// the corresponding type. \p CanonicalIV is the scalar value generated for |
512 | /// the canonical induction variable. |
513 | void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, |
514 | VPTransformState &State, Value *CanonicalIV); |
515 | |
516 | /// Construct the vector value of a scalarized value \p V one lane at a time. |
517 | void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, |
518 | VPTransformState &State); |
519 | |
520 | /// Try to vectorize interleaved access group \p Group with the base address |
521 | /// given in \p Addr, optionally masking the vector operations if \p |
522 | /// BlockInMask is non-null. Use \p State to translate given VPValues to IR |
523 | /// values in the vectorized loop. |
524 | void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, |
525 | ArrayRef<VPValue *> VPDefs, |
526 | VPTransformState &State, VPValue *Addr, |
527 | ArrayRef<VPValue *> StoredValues, |
528 | VPValue *BlockInMask = nullptr); |
529 | |
530 |   /// Set the debug location in the builder \p CustomBuilder using the debug |
531 |   /// location in \p V. If \p CustomBuilder is None, use the class member's Builder. |
532 | void setDebugLocFromInst(const Value *V, |
533 | Optional<IRBuilder<> *> CustomBuilder = None); |
534 | |
535 | /// Fix the non-induction PHIs in the OrigPHIsToFix vector. |
536 | void fixNonInductionPHIs(VPTransformState &State); |
537 | |
538 | /// Returns true if the reordering of FP operations is not allowed, but we are |
539 | /// able to vectorize with strict in-order reductions for the given RdxDesc. |
540 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); |
541 | |
542 | /// Create a broadcast instruction. This method generates a broadcast |
543 | /// instruction (shuffle) for loop invariant values and for the induction |
544 | /// value. If this is the induction variable then we extend it to N, N+1, ... |
545 | /// this is needed because each iteration in the loop corresponds to a SIMD |
546 | /// element. |
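  /// A sketch of the IR this typically produces for a loop-invariant value
  /// %v with VF = 4 (value names hypothetical):
  ///   %ins   = insertelement <4 x i32> poison, i32 %v, i32 0
  ///   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison,
  ///                          <4 x i32> zeroinitializer
  /// For the induction variable, a per-lane step is then added:
  ///   %iv.vec = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>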
547 | virtual Value *getBroadcastInstrs(Value *V); |
548 | |
549 | /// Add metadata from one instruction to another. |
550 | /// |
551 | /// This includes both the original MDs from \p From and additional ones (\see |
552 | /// addNewMetadata). Use this for *newly created* instructions in the vector |
553 | /// loop. |
554 | void addMetadata(Instruction *To, Instruction *From); |
555 | |
556 | /// Similar to the previous function but it adds the metadata to a |
557 | /// vector of instructions. |
558 | void addMetadata(ArrayRef<Value *> To, Instruction *From); |
559 | |
560 | // Returns the resume value (bc.merge.rdx) for a reduction as |
561 | // generated by fixReduction. |
562 | PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); |
563 | |
564 | protected: |
565 | friend class LoopVectorizationPlanner; |
566 | |
567 | /// A small list of PHINodes. |
568 | using PhiVector = SmallVector<PHINode *, 4>; |
569 | |
570 | /// A type for scalarized values in the new loop. Each value from the |
571 | /// original loop, when scalarized, is represented by UF x VF scalar values |
572 | /// in the new unrolled loop, where UF is the unroll factor and VF is the |
573 | /// vectorization factor. |
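  /// For example, with UF = 2 and VF = 4, a scalarized value expands to eight
  /// scalar instances, addressed by (part, lane).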
574 | using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
575 | |
576 | /// Set up the values of the IVs correctly when exiting the vector loop. |
577 | void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
578 | Value *CountRoundDown, Value *EndValue, |
579 | BasicBlock *MiddleBlock); |
580 | |
581 | /// Introduce a conditional branch (on true, condition to be set later) at the |
582 | /// end of the header=latch connecting it to itself (across the backedge) and |
583 | /// to the exit block of \p L. |
584 | void createHeaderBranch(Loop *L); |
585 | |
586 | /// Handle all cross-iteration phis in the header. |
587 | void fixCrossIterationPHIs(VPTransformState &State); |
588 | |
589 | /// Create the exit value of first order recurrences in the middle block and |
590 | /// update their users. |
591 | void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, |
592 | VPTransformState &State); |
593 | |
594 | /// Create code for the loop exit value of the reduction. |
595 | void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); |
596 | |
597 | /// Clear NSW/NUW flags from reduction instructions if necessary. |
598 | void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, |
599 | VPTransformState &State); |
600 | |
601 | /// Fixup the LCSSA phi nodes in the unique exit block. This simply |
602 | /// means we need to add the appropriate incoming value from the middle |
603 | /// block as exiting edges from the scalar epilogue loop (if present) are |
604 | /// already in place, and we exit the vector loop exclusively to the middle |
605 | /// block. |
606 | void fixLCSSAPHIs(VPTransformState &State); |
607 | |
608 | /// Iteratively sink the scalarized operands of a predicated instruction into |
609 | /// the block that was created for it. |
610 | void sinkScalarOperands(Instruction *PredInst); |
611 | |
612 | /// Shrinks vector element sizes to the smallest bitwidth they can be legally |
613 | /// represented as. |
614 | void truncateToMinimalBitwidths(VPTransformState &State); |
615 | |
616 | /// Compute scalar induction steps. \p ScalarIV is the scalar induction |
617 | /// variable on which to base the steps, \p Step is the size of the step, and |
618 | /// \p EntryVal is the value from the original loop that maps to the steps. |
619 | /// Note that \p EntryVal doesn't have to be an induction variable - it |
620 | /// can also be a truncate instruction. |
621 | void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, |
622 | const InductionDescriptor &ID, VPValue *Def, |
623 | VPTransformState &State); |
624 | |
625 | /// Create a vector induction phi node based on an existing scalar one. \p |
626 | /// EntryVal is the value from the original loop that maps to the vector phi |
627 | /// node, and \p Step is the loop-invariant step. If \p EntryVal is a |
628 | /// truncate instruction, instead of widening the original IV, we widen a |
629 | /// version of the IV truncated to \p EntryVal's type. |
630 | void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, |
631 | Value *Step, Value *Start, |
632 | Instruction *EntryVal, VPValue *Def, |
633 | VPTransformState &State); |
634 | |
635 | /// Returns (and creates if needed) the original loop trip count. |
636 | Value *getOrCreateTripCount(Loop *NewLoop); |
637 | |
638 | /// Returns (and creates if needed) the trip count of the widened loop. |
639 | Value *getOrCreateVectorTripCount(Loop *NewLoop); |
640 | |
641 | /// Returns a bitcasted value to the requested vector type. |
642 | /// Also handles bitcasts of vector<float> <-> vector<pointer> types. |
643 | Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, |
644 | const DataLayout &DL); |
645 | |
646 | /// Emit a bypass check to see if the vector trip count is zero, including if |
647 | /// it overflows. |
648 | void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); |
649 | |
650 | /// Emit a bypass check to see if all of the SCEV assumptions we've |
651 | /// had to make are correct. Returns the block containing the checks or |
652 | /// nullptr if no checks have been added. |
653 | BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); |
654 | |
655 | /// Emit bypass checks to check any memory assumptions we may have made. |
656 | /// Returns the block containing the checks or nullptr if no checks have been |
657 | /// added. |
658 | BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); |
659 | |
660 | /// Compute the transformed value of Index at offset StartValue using step |
661 | /// StepValue. |
662 | /// For integer induction, returns StartValue + Index * StepValue. |
663 | /// For pointer induction, returns StartValue[Index * StepValue]. |
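  /// For example (hypothetical values): StartValue = 10, StepValue = 3 and
  /// Index = 4 give 10 + 4 * 3 = 22 for the integer form.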
664 | /// FIXME: The newly created binary instructions should contain nsw/nuw |
665 | /// flags, which can be found from the original scalar operations. |
666 | Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, |
667 | const DataLayout &DL, |
668 | const InductionDescriptor &ID, |
669 | BasicBlock *VectorHeader) const; |
670 | |
671 | /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, |
672 | /// vector loop preheader, middle block and scalar preheader. Also |
673 | /// allocate a loop object for the new vector loop and return it. |
674 | Loop *createVectorLoopSkeleton(StringRef Prefix); |
675 | |
676 | /// Create new phi nodes for the induction variables to resume iteration count |
677 | /// in the scalar epilogue, from where the vectorized loop left off. |
678 | /// In cases where the loop skeleton is more complicated (eg. epilogue |
679 | /// vectorization) and the resume values can come from an additional bypass |
680 | /// block, the \p AdditionalBypass pair provides information about the bypass |
681 | /// block and the end value on the edge from bypass to this loop. |
682 | void createInductionResumeValues( |
683 | Loop *L, |
684 | std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
685 | |
686 | /// Complete the loop skeleton by adding debug MDs, creating appropriate |
687 | /// conditional branches in the middle block, preparing the builder and |
688 | /// running the verifier. Take in the vector loop \p L as argument, and return |
689 | /// the preheader of the completed vector loop. |
690 | BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); |
691 | |
692 | /// Add additional metadata to \p To that was not present on \p Orig. |
693 | /// |
694 | /// Currently this is used to add the noalias annotations based on the |
695 | /// inserted memchecks. Use this for instructions that are *cloned* into the |
696 | /// vector loop. |
697 | void addNewMetadata(Instruction *To, const Instruction *Orig); |
698 | |
699 | /// Collect poison-generating recipes that may generate a poison value that is |
700 | /// used after vectorization, even when their operands are not poison. Those |
701 | /// recipes meet the following conditions: |
702 | /// * Contribute to the address computation of a recipe generating a widen |
703 | /// memory load/store (VPWidenMemoryInstructionRecipe or |
704 | /// VPInterleaveRecipe). |
705 | /// * Such a widen memory load/store has at least one underlying Instruction |
706 | /// that is in a basic block that needs predication and after vectorization |
707 | /// the generated instruction won't be predicated. |
708 | void collectPoisonGeneratingRecipes(VPTransformState &State); |
709 | |
710 | /// Allow subclasses to override and print debug traces before/after vplan |
711 | /// execution, when trace information is requested. |
712 | virtual void printDebugTracesAtStart(){}; |
713 | virtual void printDebugTracesAtEnd(){}; |
714 | |
715 | /// The original loop. |
716 | Loop *OrigLoop; |
717 | |
718 | /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
719 | /// dynamic knowledge to simplify SCEV expressions and converts them to a |
720 | /// more usable form. |
721 | PredicatedScalarEvolution &PSE; |
722 | |
723 | /// Loop Info. |
724 | LoopInfo *LI; |
725 | |
726 | /// Dominator Tree. |
727 | DominatorTree *DT; |
728 | |
729 | /// Alias Analysis. |
730 | AAResults *AA; |
731 | |
732 | /// Target Library Info. |
733 | const TargetLibraryInfo *TLI; |
734 | |
735 | /// Target Transform Info. |
736 | const TargetTransformInfo *TTI; |
737 | |
738 | /// Assumption Cache. |
739 | AssumptionCache *AC; |
740 | |
741 | /// Interface to emit optimization remarks. |
742 | OptimizationRemarkEmitter *ORE; |
743 | |
744 | /// LoopVersioning. It's only set up (non-null) if memchecks were |
745 | /// used. |
746 | /// |
747 | /// This is currently only used to add no-alias metadata based on the |
748 |   /// memchecks. The actual versioning is performed manually. |
749 | std::unique_ptr<LoopVersioning> LVer; |
750 | |
751 | /// The vectorization SIMD factor to use. Each vector will have this many |
752 | /// vector elements. |
753 | ElementCount VF; |
754 | |
755 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
756 | /// many different vector instructions. |
757 | unsigned UF; |
758 | |
759 | /// The builder that we use |
760 | IRBuilder<> Builder; |
761 | |
762 | // --- Vectorization state --- |
763 | |
764 | /// The vector-loop preheader. |
765 | BasicBlock *LoopVectorPreHeader; |
766 | |
767 | /// The scalar-loop preheader. |
768 | BasicBlock *LoopScalarPreHeader; |
769 | |
770 | /// Middle Block between the vector and the scalar. |
771 | BasicBlock *LoopMiddleBlock; |
772 | |
773 | /// The unique ExitBlock of the scalar loop if one exists. Note that |
774 | /// there can be multiple exiting edges reaching this block. |
775 | BasicBlock *LoopExitBlock; |
776 | |
777 | /// The vector loop body. |
778 | BasicBlock *LoopVectorBody; |
779 | |
780 | /// The scalar loop body. |
781 | BasicBlock *LoopScalarBody; |
782 | |
783 | /// A list of all bypass blocks. The first block is the entry of the loop. |
784 | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
785 | |
786 | /// Store instructions that were predicated. |
787 | SmallVector<Instruction *, 4> PredicatedInstructions; |
788 | |
789 | /// Trip count of the original loop. |
790 | Value *TripCount = nullptr; |
791 | |
792 | /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
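  /// For example (hypothetical values): TripCount = 37 with VF * UF = 8 gives
  /// a VectorTripCount of 32, leaving 5 iterations for the scalar remainder.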
793 | Value *VectorTripCount = nullptr; |
794 | |
795 | /// The legality analysis. |
796 | LoopVectorizationLegality *Legal; |
797 | |
798 |   /// The profitability analysis. |
799 | LoopVectorizationCostModel *Cost; |
800 | |
801 | // Record whether runtime checks are added. |
802 | bool AddedSafetyChecks = false; |
803 | |
804 | // Holds the end values for each induction variable. We save the end values |
805 | // so we can later fix-up the external users of the induction variables. |
806 | DenseMap<PHINode *, Value *> IVEndValues; |
807 | |
808 | // Vector of original scalar PHIs whose corresponding widened PHIs need to be |
809 | // fixed up at the end of vector code generation. |
810 | SmallVector<PHINode *, 8> OrigPHIsToFix; |
811 | |
812 | /// BFI and PSI are used to check for profile guided size optimizations. |
813 | BlockFrequencyInfo *BFI; |
814 | ProfileSummaryInfo *PSI; |
815 | |
816 |   // Whether this loop should be optimized for size based on profile-guided |
817 |   // size optimizations. |
818 | bool OptForSizeBasedOnProfile; |
819 | |
820 | /// Structure to hold information about generated runtime checks, responsible |
821 | /// for cleaning the checks, if vectorization turns out unprofitable. |
822 | GeneratedRTChecks &RTChecks; |
823 | |
824 | // Holds the resume values for reductions in the loops, used to set the |
825 | // correct start value of reduction PHIs when vectorizing the epilogue. |
826 | SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> |
827 | ReductionResumeValues; |
828 | }; |
829 | |
830 | class InnerLoopUnroller : public InnerLoopVectorizer { |
831 | public: |
832 | InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
833 | LoopInfo *LI, DominatorTree *DT, |
834 | const TargetLibraryInfo *TLI, |
835 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
836 | OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
837 | LoopVectorizationLegality *LVL, |
838 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
839 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
840 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
841 | ElementCount::getFixed(1), UnrollFactor, LVL, CM, |
842 | BFI, PSI, Check) {} |
843 | |
844 | private: |
845 | Value *getBroadcastInstrs(Value *V) override; |
846 | }; |
847 | |
848 | /// Encapsulate information regarding vectorization of a loop and its epilogue. |
849 | /// This information is meant to be updated and used across two stages of |
850 | /// epilogue vectorization. |
851 | struct EpilogueLoopVectorizationInfo { |
852 | ElementCount MainLoopVF = ElementCount::getFixed(0); |
853 | unsigned MainLoopUF = 0; |
854 | ElementCount EpilogueVF = ElementCount::getFixed(0); |
855 | unsigned EpilogueUF = 0; |
856 | BasicBlock *MainLoopIterationCountCheck = nullptr; |
857 | BasicBlock *EpilogueIterationCountCheck = nullptr; |
858 | BasicBlock *SCEVSafetyCheck = nullptr; |
859 | BasicBlock *MemSafetyCheck = nullptr; |
860 | Value *TripCount = nullptr; |
861 | Value *VectorTripCount = nullptr; |
862 | |
863 | EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
864 | ElementCount EVF, unsigned EUF) |
865 | : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { |
866 |     assert(EUF == 1 && |
867 |            "A high UF for the epilogue loop is likely not beneficial."); |
868 | } |
869 | }; |
870 | |
871 | /// An extension of the inner loop vectorizer that creates a skeleton for a |
872 | /// vectorized loop that has its epilogue (residual) also vectorized. |
873 | /// The idea is to run the vplan on a given loop twice, firstly to setup the |
874 | /// skeleton and vectorize the main loop, and secondly to complete the skeleton |
875 | /// from the first step and vectorize the epilogue. This is achieved by |
876 | /// deriving two concrete strategy classes from this base class and invoking |
877 | /// them in succession from the loop vectorizer planner. |
878 | class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { |
879 | public: |
880 | InnerLoopAndEpilogueVectorizer( |
881 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
882 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
883 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
884 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
885 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
886 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
887 | GeneratedRTChecks &Checks) |
888 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
889 | EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, |
890 | Checks), |
891 | EPI(EPI) {} |
892 | |
893 | // Override this function to handle the more complex control flow around the |
894 | // three loops. |
895 | std::pair<BasicBlock *, Value *> |
896 | createVectorizedLoopSkeleton() final override { |
897 | return createEpilogueVectorizedLoopSkeleton(); |
898 | } |
899 | |
900 | /// The interface for creating a vectorized skeleton using one of two |
901 | /// different strategies, each corresponding to one execution of the vplan |
902 | /// as described above. |
903 | virtual std::pair<BasicBlock *, Value *> |
904 | createEpilogueVectorizedLoopSkeleton() = 0; |
905 | |
906 | /// Holds and updates state information required to vectorize the main loop |
907 | /// and its epilogue in two separate passes. This setup helps us avoid |
908 | /// regenerating and recomputing runtime safety checks. It also helps us to |
909 | /// shorten the iteration-count-check path length for the cases where the |
910 | /// iteration count of the loop is so small that the main vector loop is |
911 | /// completely skipped. |
912 | EpilogueLoopVectorizationInfo &EPI; |
913 | }; |
914 | |
915 | /// A specialized derived class of inner loop vectorizer that performs |
916 | /// vectorization of *main* loops in the process of vectorizing loops and their |
917 | /// epilogues. |
918 | class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { |
919 | public: |
920 | EpilogueVectorizerMainLoop( |
921 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
922 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
923 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
924 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
925 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
926 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
927 | GeneratedRTChecks &Check) |
928 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
929 | EPI, LVL, CM, BFI, PSI, Check) {} |
930 | /// Implements the interface for creating a vectorized skeleton using the |
931 |   /// *main loop* strategy (i.e. the first pass of vplan execution). |
932 | std::pair<BasicBlock *, Value *> |
933 | createEpilogueVectorizedLoopSkeleton() final override; |
934 | |
935 | protected: |
936 | /// Emits an iteration count bypass check once for the main loop (when \p |
937 | /// ForEpilogue is false) and once for the epilogue loop (when \p |
938 | /// ForEpilogue is true). |
939 | BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, |
940 | bool ForEpilogue); |
941 | void printDebugTracesAtStart() override; |
942 | void printDebugTracesAtEnd() override; |
943 | }; |
944 | |
945 | // A specialized derived class of inner loop vectorizer that performs |
946 | // vectorization of *epilogue* loops in the process of vectorizing loops and |
947 | // their epilogues. |
948 | class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { |
949 | public: |
950 | EpilogueVectorizerEpilogueLoop( |
951 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
952 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
953 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
954 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
955 | LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, |
956 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
957 | GeneratedRTChecks &Checks) |
958 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
959 | EPI, LVL, CM, BFI, PSI, Checks) {} |
960 | /// Implements the interface for creating a vectorized skeleton using the |
961 |   /// *epilogue loop* strategy (i.e. the second pass of vplan execution). |
962 | std::pair<BasicBlock *, Value *> |
963 | createEpilogueVectorizedLoopSkeleton() final override; |
964 | |
965 | protected: |
966 | /// Emits an iteration count bypass check after the main vector loop has |
967 | /// finished to see if there are any iterations left to execute by either |
968 | /// the vector epilogue or the scalar epilogue. |
969 | BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, |
970 | BasicBlock *Bypass, |
971 | BasicBlock *Insert); |
972 | void printDebugTracesAtStart() override; |
973 | void printDebugTracesAtEnd() override; |
974 | }; |
975 | } // end namespace llvm |
976 | |
977 | /// Look for a meaningful debug location on the instruction or its |
978 | /// operands. |
979 | static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
980 | if (!I) |
981 | return I; |
982 | |
983 | DebugLoc Empty; |
984 | if (I->getDebugLoc() != Empty) |
985 | return I; |
986 | |
987 | for (Use &Op : I->operands()) { |
988 | if (Instruction *OpInst = dyn_cast<Instruction>(Op)) |
989 | if (OpInst->getDebugLoc() != Empty) |
990 | return OpInst; |
991 | } |
992 | |
993 | return I; |
994 | } |
995 | |
996 | void InnerLoopVectorizer::setDebugLocFromInst( |
997 | const Value *V, Optional<IRBuilder<> *> CustomBuilder) { |
998 | IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; |
999 | if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { |
1000 | const DILocation *DIL = Inst->getDebugLoc(); |
1001 | |
1002 | // When a FSDiscriminator is enabled, we don't need to add the multiply |
1003 | // factors to the discriminators. |
1004 | if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && |
1005 | !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { |
1006 | // FIXME: For scalable vectors, assume vscale=1. |
1007 | auto NewDIL = |
1008 | DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); |
1009 | if (NewDIL) |
1010 | B->SetCurrentDebugLocation(NewDIL.getValue()); |
1011 | else |
1012 |         LLVM_DEBUG(dbgs() |
1013 |                    << "Failed to create new discriminator: " |
1014 |                    << DIL->getFilename() << " Line: " << DIL->getLine()); |
1015 | } else |
1016 | B->SetCurrentDebugLocation(DIL); |
1017 | } else |
1018 | B->SetCurrentDebugLocation(DebugLoc()); |
1019 | } |
1020 | |
1021 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
1022 | /// is passed, the message relates to that particular instruction. |
1023 | #ifndef NDEBUG |
1024 | static void debugVectorizationMessage(const StringRef Prefix, |
1025 | const StringRef DebugMsg, |
1026 | Instruction *I) { |
1027 | dbgs() << "LV: " << Prefix << DebugMsg; |
1028 | if (I != nullptr) |
1029 | dbgs() << " " << *I; |
1030 | else |
1031 | dbgs() << '.'; |
1032 | dbgs() << '\n'; |
1033 | } |
1034 | #endif |
1035 | |
1036 | /// Create an analysis remark that explains why vectorization failed |
1037 | /// |
1038 | /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
1039 | /// RemarkName is the identifier for the remark. If \p I is passed it is an |
1040 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
1041 | /// the location of the remark. \return the remark object that can be |
1042 | /// streamed to. |
1043 | static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
1044 | StringRef RemarkName, Loop *TheLoop, Instruction *I) { |
1045 | Value *CodeRegion = TheLoop->getHeader(); |
1046 | DebugLoc DL = TheLoop->getStartLoc(); |
1047 | |
1048 | if (I) { |
1049 | CodeRegion = I->getParent(); |
1050 |     // If there is no debug location attached to the instruction, fall back to |
1051 | // using the loop's. |
1052 | if (I->getDebugLoc()) |
1053 | DL = I->getDebugLoc(); |
1054 | } |
1055 | |
1056 | return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
1057 | } |
1058 | |
1059 | namespace llvm { |
1060 | |
1061 | /// Return a value for Step multiplied by VF. |
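/// For example (hypothetical values): with Ty = i64 and Step = 2, a fixed VF
/// of 4 yields the constant i64 8, while a scalable VF of <vscale x 4> yields
/// 8 * vscale, materialized through the llvm.vscale intrinsic.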
1062 | Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, |
1063 | int64_t Step) { |
1064 |   assert(Ty->isIntegerTy() && "Expected an integer step"); |
1065 | Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); |
1066 | return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; |
1067 | } |
1068 | |
1069 | /// Return the runtime value for VF. |
1070 | Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { |
1071 | Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); |
1072 | return VF.isScalable() ? B.CreateVScale(EC) : EC; |
1073 | } |
1074 | |
1075 | static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { |
1076 |   assert(FTy->isFloatingPointTy() && "Expected floating point type!"); |
1077 | Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); |
1078 | Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); |
1079 | return B.CreateUIToFP(RuntimeVF, FTy); |
1080 | } |
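// Illustrative note (editorial): for FTy == float and VF == <vscale x 4>, the
// runtime VF is first computed in a 32-bit integer (the float's scalar size in
// bits) and then converted with uitofp, yielding a float-typed 4 * vscale.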
1081 | |
1082 | void reportVectorizationFailure(const StringRef DebugMsg, |
1083 | const StringRef OREMsg, const StringRef ORETag, |
1084 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1085 | Instruction *I) { |
1086 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { debugVectorizationMessage("Not vectorizing: " , DebugMsg, I); } } while (false); |
1087 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
1088 | ORE->emit( |
1089 | createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) |
1090 | << "loop not vectorized: " << OREMsg); |
1091 | } |
1092 | |
1093 | void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, |
1094 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1095 | Instruction *I) { |
1096 | LLVM_DEBUG(debugVectorizationMessage("", Msg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { debugVectorizationMessage("", Msg, I); } } while (false); |
1097 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
1098 | ORE->emit( |
1099 | createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) |
1100 | << Msg); |
1101 | } |
1102 | |
1103 | } // end namespace llvm |
1104 | |
1105 | #ifndef NDEBUG |
1106 | /// \return a string containing a file name and a line number for the given loop.
1107 | static std::string getDebugLocString(const Loop *L) { |
1108 | std::string Result; |
1109 | if (L) { |
1110 | raw_string_ostream OS(Result); |
1111 | if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
1112 | LoopDbgLoc.print(OS); |
1113 | else |
1114 | // Just print the module name. |
1115 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
1116 | OS.flush(); |
1117 | } |
1118 | return Result; |
1119 | } |
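// Example output (editorial, assuming debug info is present): a location such
// as "foo.c:12:3" as printed by DebugLoc::print; without a start location the
// helper falls back to the module identifier instead.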
1120 | #endif |
1121 | |
1122 | void InnerLoopVectorizer::addNewMetadata(Instruction *To, |
1123 | const Instruction *Orig) { |
1124 | // If the loop was versioned with memchecks, add the corresponding no-alias |
1125 | // metadata. |
1126 | if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) |
1127 | LVer->annotateInstWithNoAlias(To, Orig); |
1128 | } |
1129 | |
1130 | void InnerLoopVectorizer::collectPoisonGeneratingRecipes( |
1131 | VPTransformState &State) { |
1132 | |
1133 | // Collect recipes in the backward slice of `Root` that may generate a poison |
1134 | // value that is used after vectorization. |
1135 | SmallPtrSet<VPRecipeBase *, 16> Visited; |
1136 | auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
1137 | SmallVector<VPRecipeBase *, 16> Worklist; |
1138 | Worklist.push_back(Root); |
1139 | |
1140 | // Traverse the backward slice of Root through its use-def chain. |
1141 | while (!Worklist.empty()) { |
1142 | VPRecipeBase *CurRec = Worklist.back(); |
1143 | Worklist.pop_back(); |
1144 | |
1145 | if (!Visited.insert(CurRec).second) |
1146 | continue; |
1147 | |
1148 | // Prune search if we find another recipe generating a widen memory |
1149 | // instruction. Widen memory instructions involved in address computation |
1150 | // will lead to gather/scatter instructions, which don't need to be |
1151 | // handled. |
1152 | if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || |
1153 | isa<VPInterleaveRecipe>(CurRec) || |
1154 | isa<VPCanonicalIVPHIRecipe>(CurRec)) |
1155 | continue; |
1156 | |
1157 | // This recipe contributes to the address computation of a widen |
1158 | // load/store. Collect the recipe if its underlying instruction has
1159 | // poison-generating flags. |
1160 | Instruction *Instr = CurRec->getUnderlyingInstr(); |
1161 | if (Instr && Instr->hasPoisonGeneratingFlags()) |
1162 | State.MayGeneratePoisonRecipes.insert(CurRec); |
1163 | |
1164 | // Add new definitions to the worklist. |
1165 | for (VPValue *Op : CurRec->operands())
1166 | if (VPDef *OpDef = Op->getDef())
1167 | Worklist.push_back(cast<VPRecipeBase>(OpDef)); |
1168 | } |
1169 | }); |
1170 | |
1171 | // Traverse all the recipes in the VPlan and collect the poison-generating
1172 | // recipes in the backward slice starting at the address of a
1173 | // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1174 | auto Iter = depth_first( |
1175 | VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); |
1176 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { |
1177 | for (VPRecipeBase &Recipe : *VPBB) { |
1178 | if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { |
1179 | Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); |
1180 | VPDef *AddrDef = WidenRec->getAddr()->getDef(); |
1181 | if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && |
1182 | Legal->blockNeedsPredication(UnderlyingInstr->getParent())) |
1183 | collectPoisonGeneratingInstrsInBackwardSlice( |
1184 | cast<VPRecipeBase>(AddrDef)); |
1185 | } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { |
1186 | VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); |
1187 | if (AddrDef) { |
1188 | // Check if any member of the interleave group needs predication. |
1189 | const InterleaveGroup<Instruction> *InterGroup = |
1190 | InterleaveRec->getInterleaveGroup(); |
1191 | bool NeedPredication = false; |
1192 | for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
1193 | I < NumMembers; ++I) { |
1194 | Instruction *Member = InterGroup->getMember(I); |
1195 | if (Member) |
1196 | NeedPredication |= |
1197 | Legal->blockNeedsPredication(Member->getParent()); |
1198 | } |
1199 | |
1200 | if (NeedPredication) |
1201 | collectPoisonGeneratingInstrsInBackwardSlice( |
1202 | cast<VPRecipeBase>(AddrDef)); |
1203 | } |
1204 | } |
1205 | } |
1206 | } |
1207 | } |
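// Editorial example of why this pass over the plan matters: a consecutive load
// that was conditional in the original loop may execute unconditionally after
// vectorization. If its address computation used, e.g., an inbounds GEP, the
// poison-generating flag could then apply to lanes that never ran in the
// original loop, so such recipes are recorded in State.MayGeneratePoisonRecipes
// for special handling during code generation.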
1208 | |
1209 | void InnerLoopVectorizer::addMetadata(Instruction *To, |
1210 | Instruction *From) { |
1211 | propagateMetadata(To, From); |
1212 | addNewMetadata(To, From); |
1213 | } |
1214 | |
1215 | void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, |
1216 | Instruction *From) { |
1217 | for (Value *V : To) { |
1218 | if (Instruction *I = dyn_cast<Instruction>(V)) |
1219 | addMetadata(I, From); |
1220 | } |
1221 | } |
1222 | |
1223 | PHINode *InnerLoopVectorizer::getReductionResumeValue( |
1224 | const RecurrenceDescriptor &RdxDesc) { |
1225 | auto It = ReductionResumeValues.find(&RdxDesc); |
1226 | assert(It != ReductionResumeValues.end() &&
1227 | "Expected to find a resume value for the reduction.");
1228 | return It->second; |
1229 | } |
1230 | |
1231 | namespace llvm { |
1232 | |
1233 | // A hint to the loop vectorization cost model about how the scalar epilogue
1234 | // loop should be lowered.
1235 | enum ScalarEpilogueLowering { |
1236 | |
1237 | // The default: allowing scalar epilogues. |
1238 | CM_ScalarEpilogueAllowed, |
1239 | |
1240 | // Vectorization with OptForSize: don't allow epilogues. |
1241 | CM_ScalarEpilogueNotAllowedOptSize, |
1242 | |
1243 | // A special case of vectorization with OptForSize: loops with a very small
1244 | // trip count are considered for vectorization under OptForSize, thereby
1245 | // making sure the cost of their loop body is dominant and free of runtime
1246 | // guards and scalar-iteration overheads.
1247 | CM_ScalarEpilogueNotAllowedLowTripLoop, |
1248 | |
1249 | // Loop hint predicate indicating an epilogue is undesired. |
1250 | CM_ScalarEpilogueNotNeededUsePredicate, |
1251 | |
1252 | // Directive indicating we must either tail fold or not vectorize.
1253 | CM_ScalarEpilogueNotAllowedUsePredicate |
1254 | }; |
1255 | |
1256 | /// ElementCountComparator creates a total ordering for ElementCount |
1257 | /// for the purposes of using it in a set structure. |
1258 | struct ElementCountComparator { |
1259 | bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { |
1260 | return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < |
1261 | std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); |
1262 | } |
1263 | }; |
1264 | using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; |
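// Ordering example (editorial): since the comparator orders by the pair
// (isScalable, KnownMinValue), all fixed VFs sort before all scalable ones,
// e.g. 2, 4, 8, vscale x 2, vscale x 4.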
1265 | |
1266 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
1267 | /// vectorization. |
1268 | /// In many cases vectorization is not profitable. This can happen because of |
1269 | /// a number of reasons. In this class we mainly attempt to predict the |
1270 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
1271 | /// TargetTransformInfo to query the different backends for the cost of |
1272 | /// different operations. |
1273 | class LoopVectorizationCostModel { |
1274 | public: |
1275 | LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
1276 | PredicatedScalarEvolution &PSE, LoopInfo *LI, |
1277 | LoopVectorizationLegality *Legal, |
1278 | const TargetTransformInfo &TTI, |
1279 | const TargetLibraryInfo *TLI, DemandedBits *DB, |
1280 | AssumptionCache *AC, |
1281 | OptimizationRemarkEmitter *ORE, const Function *F, |
1282 | const LoopVectorizeHints *Hints, |
1283 | InterleavedAccessInfo &IAI) |
1284 | : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
1285 | TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
1286 | Hints(Hints), InterleaveInfo(IAI) {} |
1287 | |
1288 | /// \return An upper bound for the vectorization factors (both fixed and |
1289 | /// scalable). If the factors are 0, vectorization and interleaving should be |
1290 | /// avoided up front. |
1291 | FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); |
1292 | |
1293 | /// \return True if runtime checks are required for vectorization, and false |
1294 | /// otherwise. |
1295 | bool runtimeChecksRequired(); |
1296 | |
1297 | /// \return The most profitable vectorization factor and the cost of that VF. |
1298 | /// This method checks every VF in \p CandidateVFs. If UserVF is non-zero,
1299 | /// that vectorization factor will be selected if vectorization is
1300 | /// possible.
1301 | VectorizationFactor |
1302 | selectVectorizationFactor(const ElementCountSet &CandidateVFs); |
1303 | |
1304 | VectorizationFactor |
1305 | selectEpilogueVectorizationFactor(const ElementCount MaxVF, |
1306 | const LoopVectorizationPlanner &LVP); |
1307 | |
1308 | /// Setup cost-based decisions for user vectorization factor. |
1309 | /// \return true if the UserVF is a feasible VF to be chosen. |
1310 | bool selectUserVectorizationFactor(ElementCount UserVF) { |
1311 | collectUniformsAndScalars(UserVF); |
1312 | collectInstsToScalarize(UserVF); |
1313 | return expectedCost(UserVF).first.isValid(); |
1314 | } |
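// Editorial note: "feasible" here only means the expected cost of UserVF is
// valid, i.e. no instruction reported an invalid cost for that VF; it does not
// imply that UserVF is the most profitable factor.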
1315 | |
1316 | /// \return The size (in bits) of the smallest and widest types in the code |
1317 | /// that needs to be vectorized. We ignore values that remain scalar such as |
1318 | /// 64-bit loop indices.
1319 | std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
1320 | |
1321 | /// \return The desired interleave count. |
1322 | /// If interleave count has been specified by metadata it will be returned. |
1323 | /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
1324 | /// are the selected vectorization factor and the cost of the selected VF. |
1325 | unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); |
1326 | |
1327 | /// A memory access instruction may be vectorized in more than one way.
1328 | /// The form of the instruction after vectorization depends on its cost.
1329 | /// This function takes cost-based decisions for Load/Store instructions
1330 | /// and collects them in a map. This decision map is used for building
1331 | /// the lists of loop-uniform and loop-scalar instructions.
1332 | /// The calculated cost is saved with the widening decision in order to
1333 | /// avoid redundant calculations.
1334 | void setCostBasedWideningDecision(ElementCount VF); |
1335 | |
1336 | /// A struct that represents some properties of the register usage |
1337 | /// of a loop. |
1338 | struct RegisterUsage { |
1339 | /// Holds the number of loop invariant values that are used in the loop. |
1340 | /// The key is ClassID of target-provided register class. |
1341 | SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; |
1342 | /// Holds the maximum number of concurrent live intervals in the loop. |
1343 | /// The key is ClassID of target-provided register class. |
1344 | SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; |
1345 | }; |
1346 | |
1347 | /// \return Returns information about the register usages of the loop for the |
1348 | /// given vectorization factors. |
1349 | SmallVector<RegisterUsage, 8> |
1350 | calculateRegisterUsage(ArrayRef<ElementCount> VFs); |
1351 | |
1352 | /// Collect values we want to ignore in the cost model. |
1353 | void collectValuesToIgnore(); |
1354 | |
1355 | /// Collect all element types in the loop for which widening is needed. |
1356 | void collectElementTypesForWidening(); |
1357 | |
1358 | /// Split reductions into those that happen in the loop, and those that happen
1359 | /// outside. In-loop reductions are collected into InLoopReductionChains.
1360 | void collectInLoopReductions(); |
1361 | |
1362 | /// Returns true if we should use strict in-order reductions for the given |
1363 | /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
1364 | /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
1365 | /// of FP operations. |
1366 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { |
1367 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
1368 | } |
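// Illustrative case (editorial): an fadd reduction without reassociation
// permission typically has RdxDesc.isOrdered() set; unless the loop hints
// explicitly allow reordering, it must then be vectorized as a strict
// in-order (ordered) reduction.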
1369 | |
1370 | /// \returns The smallest bitwidth each instruction can be represented with. |
1371 | /// The vector equivalents of these instructions should be truncated to this |
1372 | /// type. |
1373 | const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
1374 | return MinBWs; |
1375 | } |
1376 | |
1377 | /// \returns True if it is more profitable to scalarize instruction \p I for |
1378 | /// vectorization factor \p VF. |
1379 | bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
1380 | assert(VF.isVector() &&
1381 | "Profitable to scalarize relevant only for VF > 1.");
1382 | |
1383 | // Cost model is not run in the VPlan-native path - return conservative |
1384 | // result until this changes. |
1385 | if (EnableVPlanNativePath) |
1386 | return false; |
1387 | |
1388 | auto Scalars = InstsToScalarize.find(VF); |
1389 | assert(Scalars != InstsToScalarize.end() &&
1390 | "VF not yet analyzed for scalarization profitability");
1391 | return Scalars->second.find(I) != Scalars->second.end(); |
1392 | } |
1393 | |
1394 | /// Returns true if \p I is known to be uniform after vectorization. |
1395 | bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
1396 | if (VF.isScalar()) |
1397 | return true; |
1398 | |
1399 | // Cost model is not run in the VPlan-native path - return conservative |
1400 | // result until this changes. |
1401 | if (EnableVPlanNativePath) |
1402 | return false; |
1403 | |
1404 | auto UniformsPerVF = Uniforms.find(VF); |
1405 | assert(UniformsPerVF != Uniforms.end() &&
1406 | "VF not yet analyzed for uniformity");
1407 | return UniformsPerVF->second.count(I); |
1408 | } |
1409 | |
1410 | /// Returns true if \p I is known to be scalar after vectorization. |
1411 | bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
1412 | if (VF.isScalar()) |
1413 | return true; |
1414 | |
1415 | // Cost model is not run in the VPlan-native path - return conservative |
1416 | // result until this changes. |
1417 | if (EnableVPlanNativePath) |
1418 | return false; |
1419 | |
1420 | auto ScalarsPerVF = Scalars.find(VF); |
1421 | assert(ScalarsPerVF != Scalars.end() &&
1422 | "Scalar values are not calculated for VF");
1423 | return ScalarsPerVF->second.count(I); |
1424 | } |
1425 | |
1426 | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
1427 | /// for vectorization factor \p VF. |
1428 | bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
1429 | return VF.isVector() && MinBWs.find(I) != MinBWs.end() && |
1430 | !isProfitableToScalarize(I, VF) && |
1431 | !isScalarAfterVectorization(I, VF); |
1432 | } |
1433 | |
1434 | /// Decision that was taken during cost calculation for memory instruction. |
1435 | enum InstWidening { |
1436 | CM_Unknown, |
1437 | CM_Widen, // For consecutive accesses with stride +1. |
1438 | CM_Widen_Reverse, // For consecutive accesses with stride -1. |
1439 | CM_Interleave, |
1440 | CM_GatherScatter, |
1441 | CM_Scalarize |
1442 | }; |
1443 | |
1444 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1445 | /// instruction \p I and vector width \p VF. |
1446 | void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
1447 | InstructionCost Cost) { |
1448 | assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2" ) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1448, __extension__ __PRETTY_FUNCTION__)); |
1449 | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
1450 | } |
1451 | |
1452 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1453 | /// interleaving group \p Grp and vector width \p VF. |
1454 | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
1455 | ElementCount VF, InstWidening W, |
1456 | InstructionCost Cost) { |
1457 | assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2" ) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1457, __extension__ __PRETTY_FUNCTION__)); |
1458 | // Broadcast this decision to all instructions inside the group.
1459 | // But the cost will be assigned to one instruction only.
1460 | for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
1461 | if (auto *I = Grp->getMember(i)) { |
1462 | if (Grp->getInsertPos() == I) |
1463 | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); |
1464 | else |
1465 | WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); |
1466 | } |
1467 | } |
1468 | } |
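// Editorial note: assigning the full cost only to the group's insert position
// and 0 to the remaining members keeps the interleave group from being
// double-counted when per-instruction costs are summed later.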
1469 | |
1470 | /// Return the cost model decision for the given instruction \p I and vector |
1471 | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1472 | /// through the cost modeling. |
1473 | InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
1474 | assert(VF.isVector() && "Expected VF to be a vector VF")(static_cast <bool> (VF.isVector() && "Expected VF to be a vector VF" ) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF to be a vector VF\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1474, __extension__ __PRETTY_FUNCTION__)); |
1475 | // Cost model is not run in the VPlan-native path - return conservative |
1476 | // result until this changes. |
1477 | if (EnableVPlanNativePath) |
1478 | return CM_GatherScatter; |
1479 | |
1480 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
1481 | auto Itr = WideningDecisions.find(InstOnVF); |
1482 | if (Itr == WideningDecisions.end()) |
1483 | return CM_Unknown; |
1484 | return Itr->second.first; |
1485 | } |
1486 | |
1487 | /// Return the vectorization cost for the given instruction \p I and vector |
1488 | /// width \p VF. |
1489 | InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
1490 | assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2" ) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1490, __extension__ __PRETTY_FUNCTION__)); |
1491 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); |
1492 | assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1493 | "The cost is not calculated");
1494 | return WideningDecisions[InstOnVF].second; |
1495 | } |
1496 | |
1497 | /// Return True if instruction \p I is an optimizable truncate whose operand |
1498 | /// is an induction variable. Such a truncate will be removed by adding a new |
1499 | /// induction variable with the destination type. |
1500 | bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
1501 | // If the instruction is not a truncate, return false. |
1502 | auto *Trunc = dyn_cast<TruncInst>(I); |
1503 | if (!Trunc) |
1504 | return false; |
1505 | |
1506 | // Get the source and destination types of the truncate. |
1507 | Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); |
1508 | Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); |
1509 | |
1510 | // If the truncate is free for the given types, return false. Replacing a |
1511 | // free truncate with an induction variable would add an induction variable |
1512 | // update instruction to each iteration of the loop. We exclude from this |
1513 | // check the primary induction variable since it will need an update |
1514 | // instruction regardless. |
1515 | Value *Op = Trunc->getOperand(0); |
1516 | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) |
1517 | return false; |
1518 | |
1519 | // If the truncated value is not an induction variable, return false. |
1520 | return Legal->isInductionPhi(Op); |
1521 | } |
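// Editorial example: given an induction phi %iv of type i64, a non-free
// "trunc i64 %iv to i32" returns true here, and the truncate can be removed
// by introducing a parallel i32 induction variable with the destination type.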
1522 | |
1523 | /// Collects the instructions to scalarize for each predicated instruction in |
1524 | /// the loop. |
1525 | void collectInstsToScalarize(ElementCount VF); |
1526 | |
1527 | /// Collect Uniform and Scalar values for the given \p VF. |
1528 | /// The sets depend on CM decision for Load/Store instructions |
1529 | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1530 | void collectUniformsAndScalars(ElementCount VF) { |
1531 | // Do the analysis once. |
1532 | if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) |
1533 | return; |
1534 | setCostBasedWideningDecision(VF); |
1535 | collectLoopUniforms(VF); |
1536 | collectLoopScalars(VF); |
1537 | } |
1538 | |
1539 | /// Returns true if the target machine supports masked store operation |
1540 | /// for the given \p DataType and kind of access to \p Ptr. |
1541 | bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
1542 | return Legal->isConsecutivePtr(DataType, Ptr) && |
1543 | TTI.isLegalMaskedStore(DataType, Alignment); |
1544 | } |
1545 | |
1546 | /// Returns true if the target machine supports masked load operation |
1547 | /// for the given \p DataType and kind of access to \p Ptr. |
1548 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
1549 | return Legal->isConsecutivePtr(DataType, Ptr) && |
1550 | TTI.isLegalMaskedLoad(DataType, Alignment); |
1551 | } |
1552 | |
1553 | /// Returns true if the target machine can represent \p V as a masked gather |
1554 | /// or scatter operation. |
1555 | bool isLegalGatherOrScatter(Value *V, |
1556 | ElementCount VF = ElementCount::getFixed(1)) { |
1557 | bool LI = isa<LoadInst>(V); |
1558 | bool SI = isa<StoreInst>(V); |
1559 | if (!LI && !SI) |
1560 | return false; |
1561 | auto *Ty = getLoadStoreType(V); |
1562 | Align Align = getLoadStoreAlignment(V); |
1563 | if (VF.isVector()) |
1564 | Ty = VectorType::get(Ty, VF); |
1565 | return (LI && TTI.isLegalMaskedGather(Ty, Align)) || |
1566 | (SI && TTI.isLegalMaskedScatter(Ty, Align)); |
1567 | } |
1568 | |
1569 | /// Returns true if the target machine supports all of the reduction |
1570 | /// variables found for the given VF. |
1571 | bool canVectorizeReductions(ElementCount VF) const { |
1572 | return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { |
1573 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
1574 | return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
1575 | })); |
1576 | } |
1577 | |
1578 | /// Returns true if \p I is an instruction that will be scalarized with |
1579 | /// predication when vectorizing \p I with vectorization factor \p VF. Such |
1580 | /// instructions include conditional stores and instructions that may divide |
1581 | /// by zero. |
1582 | bool isScalarWithPredication(Instruction *I, ElementCount VF) const; |
1583 | |
1584 | // Returns true if \p I is an instruction that will be predicated either |
1585 | // through scalar predication or masked load/store or masked gather/scatter. |
1586 | // \p VF is the vectorization factor that will be used to vectorize \p I. |
1587 | // Superset of instructions that return true for isScalarWithPredication. |
1588 | bool isPredicatedInst(Instruction *I, ElementCount VF, |
1589 | bool IsKnownUniform = false) { |
1590 | // When we know the load is uniform and the original scalar loop was not |
1591 | // predicated we don't need to mark it as a predicated instruction. Any |
1592 | // vectorised blocks created when tail-folding are something artificial we |
1593 | // have introduced and we know there is always at least one active lane. |
1594 | // That's why we call Legal->blockNeedsPredication here, which doesn't
1595 | // query tail-folding.
1596 | if (IsKnownUniform && isa<LoadInst>(I) && |
1597 | !Legal->blockNeedsPredication(I->getParent())) |
1598 | return false; |
1599 | if (!blockNeedsPredicationForAnyReason(I->getParent())) |
1600 | return false; |
1601 | // Loads and stores that need some form of masked operation are predicated |
1602 | // instructions. |
1603 | if (isa<LoadInst>(I) || isa<StoreInst>(I)) |
1604 | return Legal->isMaskRequired(I); |
1605 | return isScalarWithPredication(I, VF); |
1606 | } |
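// Editorial summary of the logic above: a load/store in a block that needs
// predication is a predicated instruction iff a mask is required for it; any
// other instruction falls through to the scalar-with-predication check (e.g.
// conditional stores, or instructions that may divide by zero).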
1607 | |
1608 | /// Returns true if \p I is a memory instruction with consecutive memory |
1609 | /// access that can be widened. |
1610 | bool |
1611 | memoryInstructionCanBeWidened(Instruction *I, |
1612 | ElementCount VF = ElementCount::getFixed(1)); |
1613 | |
1614 | /// Returns true if \p I is a memory instruction in an interleaved-group |
1615 | /// of memory accesses that can be vectorized with wide vector loads/stores |
1616 | /// and shuffles. |
1617 | bool |
1618 | interleavedAccessCanBeWidened(Instruction *I, |
1619 | ElementCount VF = ElementCount::getFixed(1)); |
1620 | |
1621 | /// Check if \p Instr belongs to any interleaved access group. |
1622 | bool isAccessInterleaved(Instruction *Instr) { |
1623 | return InterleaveInfo.isInterleaved(Instr); |
1624 | } |
1625 | |
1626 | /// Get the interleaved access group that \p Instr belongs to. |
1627 | const InterleaveGroup<Instruction> * |
1628 | getInterleavedAccessGroup(Instruction *Instr) { |
1629 | return InterleaveInfo.getInterleaveGroup(Instr); |
1630 | } |
1631 | |
1632 | /// Returns true if we're required to use a scalar epilogue for at least |
1633 | /// the final iteration of the original loop. |
1634 | bool requiresScalarEpilogue(ElementCount VF) const { |
1635 | if (!isScalarEpilogueAllowed()) |
1636 | return false; |
1637 | // If we might exit from anywhere but the latch, we must run the exiting
1638 | // iteration in scalar form.
1639 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) |
1640 | return true; |
1641 | return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); |
1642 | } |
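// Editorial note: one common reason InterleaveInfo requires a scalar epilogue
// is an interleave group with gaps, where running the last iteration in vector
// form could otherwise access memory outside the original loop's footprint.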
1643 | |
1644 | /// Returns true if a scalar epilogue is not allowed due to optsize or a |
1645 | /// loop hint annotation. |
1646 | bool isScalarEpilogueAllowed() const { |
1647 | return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
1648 | } |
1649 | |
1650 | /// Returns true if all loop blocks should be masked to fold the loop tail.
1651 | bool foldTailByMasking() const { return FoldTailByMasking; } |
1652 | |
1653 | /// Returns true if the instructions in this block require predication
1654 | /// for any reason, e.g. because tail folding now requires a predicate |
1655 | /// or because the block in the original loop was predicated. |
1656 | bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
1657 | return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
1658 | } |
1659 | |
1660 | /// A SmallMapVector to store the InLoop reduction op chains, mapping phi |
1661 | /// nodes to the chain of instructions representing the reductions. Uses a |
1662 | /// MapVector to ensure deterministic iteration order. |
1663 | using ReductionChainMap = |
1664 | SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; |
1665 | |
1666 | /// Return the chain of instructions representing an inloop reduction. |
1667 | const ReductionChainMap &getInLoopReductionChains() const { |
1668 | return InLoopReductionChains; |
1669 | } |
1670 | |
1671 | /// Returns true if the Phi is part of an inloop reduction. |
1672 | bool isInLoopReduction(PHINode *Phi) const { |
1673 | return InLoopReductionChains.count(Phi); |
1674 | } |
1675 | |
1676 | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1677 | /// with factor VF. Return the cost of the instruction, including |
1678 | /// scalarization overhead if it's needed. |
1679 | InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
1680 | |
1681 | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1682 | /// VF. Return the cost of the instruction, including scalarization overhead |
1683 | /// if it's needed. The flag NeedToScalarize shows if the call needs to be |
1684 | /// scalarized, i.e. either no vector version is available or the vector
1685 | /// version is too expensive.
1686 | InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, |
1687 | bool &NeedToScalarize) const; |
1688 | |
1689 | /// Returns true if the per-lane cost of VectorizationFactor A is lower than |
1690 | /// that of B. |
1691 | bool isMoreProfitable(const VectorizationFactor &A, |
1692 | const VectorizationFactor &B) const; |
1693 | |
1694 | /// Invalidates decisions already taken by the cost model. |
1695 | void invalidateCostModelingDecisions() { |
1696 | WideningDecisions.clear(); |
1697 | Uniforms.clear(); |
1698 | Scalars.clear(); |
1699 | } |
1700 | |
1701 | private: |
1702 | unsigned NumPredStores = 0; |
1703 | |
1704 | /// Convenience function that returns the value of vscale_range if
1705 | /// vscale_range.min == vscale_range.max, and otherwise returns the value
1706 | /// returned by the corresponding TTI method.
1707 | Optional<unsigned> getVScaleForTuning() const; |
1708 | |
1709 | /// \return An upper bound for the vectorization factors for both |
1710 | /// fixed and scalable vectorization, where the minimum-known number of |
1711 | /// elements is a power-of-2 larger than zero. If scalable vectorization is |
1712 | /// disabled or unsupported, then the scalable part will be equal to |
1713 | /// ElementCount::getScalable(0). |
1714 | FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, |
1715 | ElementCount UserVF, |
1716 | bool FoldTailByMasking); |
1717 | |
1718 | /// \return the maximized element count based on the target's vector
1719 | /// registers and the loop trip-count, but limited to a maximum safe VF. |
1720 | /// This is a helper function of computeFeasibleMaxVF. |
1721 | /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure |
1722 | /// issue that occurred on one of the buildbots which cannot be reproduced |
1723 | /// without having access to the proprietary compiler (see comments on
1724 | /// D98509). The issue is currently under investigation and this workaround |
1725 | /// will be removed as soon as possible. |
1726 | ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, |
1727 | unsigned SmallestType, |
1728 | unsigned WidestType, |
1729 | const ElementCount &MaxSafeVF, |
1730 | bool FoldTailByMasking); |
1731 | |
1732 | /// \return the maximum legal scalable VF, based on the safe max number |
1733 | /// of elements. |
1734 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
1735 | |
1736 | /// The vectorization cost is a combination of the cost itself and a boolean |
1737 | /// indicating whether any of the contributing operations will actually |
1738 | /// operate on vector values after type legalization in the backend. If this |
1739 | /// latter value is false, then all operations will be scalarized (i.e. no |
1740 | /// vectorization has actually taken place). |
1741 | using VectorizationCostTy = std::pair<InstructionCost, bool>; |
1742 | |
1743 | /// Returns the expected execution cost. The unit of the cost does |
1744 | /// not matter because we use the 'cost' units to compare different |
1745 | /// vector widths. The cost that is returned is *not* normalized by |
1746 | /// the factor width. If \p Invalid is not nullptr, this function |
1747 | /// will add a pair(Instruction*, ElementCount) to \p Invalid for |
1748 | /// each instruction that has an Invalid cost for the given VF. |
1749 | using InstructionVFPair = std::pair<Instruction *, ElementCount>; |
1750 | VectorizationCostTy |
1751 | expectedCost(ElementCount VF, |
1752 | SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); |
1753 | |
1754 | /// Returns the execution time cost of an instruction for a given vector |
1755 | /// width. Vector width of one means scalar. |
1756 | VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
1757 | |
1758 | /// The cost-computation logic from getInstructionCost which provides |
1759 | /// the vector type as an output parameter. |
1760 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF, |
1761 | Type *&VectorTy); |
1762 | |
1763 | /// Return the cost of instructions in an inloop reduction pattern, if I is |
1764 | /// part of that pattern. |
1765 | Optional<InstructionCost> |
1766 | getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, |
1767 | TTI::TargetCostKind CostKind); |
1768 | |
1769 | /// Calculate vectorization cost of memory instruction \p I. |
1770 | InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
1771 | |
1772 | /// The cost computation for scalarized memory instruction. |
1773 | InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
1774 | |
1775 | /// The cost computation for interleaving group of memory instructions. |
1776 | InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
1777 | |
1778 | /// The cost computation for Gather/Scatter instruction. |
1779 | InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
1780 | |
1781 | /// The cost computation for widening instruction \p I with consecutive |
1782 | /// memory access. |
1783 | InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
1784 | |
1785 | /// The cost calculation for a Load/Store instruction \p I with a uniform pointer:
1786 | /// Load: scalar load + broadcast. |
1787 | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1788 | /// element) |
1789 | InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
1790 | |
1791 | /// Estimate the overhead of scalarizing an instruction. This is a |
1792 | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1793 | InstructionCost getScalarizationOverhead(Instruction *I, |
1794 | ElementCount VF) const; |
1795 | |
1796 | /// Returns whether the instruction is a load or store and will be emitted
1797 | /// as a vector operation.
1798 | bool isConsecutiveLoadOrStore(Instruction *I); |
1799 | |
1800 | /// Returns true if an artificially high cost for emulated masked memrefs |
1801 | /// should be used. |
1802 | bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); |
1803 | |
1804 | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1805 | /// represented as. The vector equivalents of these values should be truncated |
1806 | /// to this type. |
1807 | MapVector<Instruction *, uint64_t> MinBWs; |
1808 | |
1809 | /// A type representing the costs for instructions if they were to be |
1810 | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1811 | /// pairs. |
1812 | using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
1813 | |
1814 | /// A set containing all BasicBlocks that are known to be present after
1815 | /// vectorization as predicated blocks.
1816 | SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; |
1817 | |
1818 | /// Records whether it is allowed to have the original scalar loop execute at |
1819 | /// least once. This may be needed as a fallback loop in case runtime |
1820 | /// aliasing/dependence checks fail, or to handle the tail/remainder |
1821 | /// iterations when the trip count is unknown or is not divisible by the VF,
1822 | /// or as a peel-loop to handle gaps in interleave-groups. |
1823 | /// Under optsize and when the trip count is very small we don't allow any |
1824 | /// iterations to execute in the scalar loop. |
1825 | ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
1826 | |
1827 | /// All blocks of loop are to be masked to fold tail of scalar iterations. |
1828 | bool FoldTailByMasking = false; |
1829 | |
1830 | /// A map holding scalar costs for different vectorization factors. The |
1831 | /// presence of a cost for an instruction in the mapping indicates that the |
1832 | /// instruction will be scalarized when vectorizing with the associated |
1833 | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1834 | DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
1835 | |
1836 | /// Holds the instructions known to be uniform after vectorization. |
1837 | /// The data is collected per VF. |
1838 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
1839 | |
1840 | /// Holds the instructions known to be scalar after vectorization. |
1841 | /// The data is collected per VF. |
1842 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
1843 | |
1844 | /// Holds the instructions (address computations) that are forced to be |
1845 | /// scalarized. |
1846 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1847 | |
1848 | /// PHINodes of the reductions that should be expanded in-loop along with |
1849 | /// their associated chains of reduction operations, in program order from top |
1850 | /// (PHI) to bottom.
1851 | ReductionChainMap InLoopReductionChains; |
1852 | |
1853 | /// A Map of inloop reduction operations and their immediate chain operand. |
1854 | /// FIXME: This can be removed once reductions can be costed correctly in |
1855 | /// vplan. This was added to allow quick lookup to the inloop operations, |
1856 | /// without having to loop through InLoopReductionChains. |
1857 | DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
1858 | |
1859 | /// Returns the expected difference in cost from scalarizing the expression |
1860 | /// feeding a predicated instruction \p PredInst. The instructions to |
1861 | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1862 | /// non-negative return value implies the expression will be scalarized. |
1863 | /// Currently, only single-use chains are considered for scalarization. |
1864 | int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, |
1865 | ElementCount VF); |
1866 | |
1867 | /// Collect the instructions that are uniform after vectorization. An |
1868 | /// instruction is uniform if we represent it with a single scalar value in |
1869 | /// the vectorized loop corresponding to each vector iteration. Examples of |
1870 | /// uniform instructions include pointer operands of consecutive or |
1871 | /// interleaved memory accesses. Note that although uniformity implies an |
1872 | /// instruction will be scalar, the reverse is not true. In general, a |
1873 | /// scalarized instruction will be represented by VF scalar values in the |
1874 | /// vectorized loop, each corresponding to an iteration of the original |
1875 | /// scalar loop. |
1876 | void collectLoopUniforms(ElementCount VF); |
1877 | |
1878 | /// Collect the instructions that are scalar after vectorization. An |
1879 | /// instruction is scalar if it is known to be uniform or will be scalarized |
1880 | /// during vectorization. collectLoopScalars should only add non-uniform nodes |
1881 | /// to the list if they are used by a load/store instruction that is marked as |
1882 | /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
1883 | /// VF values in the vectorized loop, each corresponding to an iteration of |
1884 | /// the original scalar loop. |
1885 | void collectLoopScalars(ElementCount VF); |
1886 | |
1887 | /// Keeps cost model vectorization decision and cost for instructions. |
1888 | /// Right now it is used for memory instructions only. |
1889 | using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
1890 | std::pair<InstWidening, InstructionCost>>; |
1891 | |
1892 | DecisionList WideningDecisions; |
1893 | |
1894 | /// Returns true if \p V is expected to be vectorized and it needs to be |
1895 | /// extracted. |
1896 | bool needsExtract(Value *V, ElementCount VF) const { |
1897 | Instruction *I = dyn_cast<Instruction>(V); |
1898 | if (VF.isScalar() || !I || !TheLoop->contains(I) || |
1899 | TheLoop->isLoopInvariant(I)) |
1900 | return false; |
1901 | |
1902 | // Assume we can vectorize V (and hence we need extraction) if the |
1903 | // scalars are not computed yet. This can happen, because it is called |
1904 | // via getScalarizationOverhead from setCostBasedWideningDecision, before |
1905 | // the scalars are collected. That should be a safe assumption in most |
1906 | // cases, because we check if the operands have vectorizable types |
1907 | // beforehand in LoopVectorizationLegality. |
1908 | return Scalars.find(VF) == Scalars.end() || |
1909 | !isScalarAfterVectorization(I, VF); |
1910 | }
1911 | |
1912 | /// Returns a range containing only operands needing to be extracted. |
1913 | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1914 | ElementCount VF) const { |
1915 | return SmallVector<Value *, 4>(make_filter_range( |
1916 | Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1917 | } |
1918 | |
1919 | /// Determines if we have the infrastructure to vectorize loop \p L and its |
1920 | /// epilogue, assuming the main loop is vectorized by \p VF. |
1921 | bool isCandidateForEpilogueVectorization(const Loop &L, |
1922 | const ElementCount VF) const; |
1923 | |
1924 | /// Returns true if epilogue vectorization is considered profitable, and |
1925 | /// false otherwise. |
1926 | /// \p VF is the vectorization factor chosen for the original loop. |
1927 | bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
1928 | |
1929 | public: |
1930 | /// The loop that we evaluate. |
1931 | Loop *TheLoop; |
1932 | |
1933 | /// Predicated scalar evolution analysis. |
1934 | PredicatedScalarEvolution &PSE; |
1935 | |
1936 | /// Loop Info analysis. |
1937 | LoopInfo *LI; |
1938 | |
1939 | /// Vectorization legality. |
1940 | LoopVectorizationLegality *Legal; |
1941 | |
1942 | /// Vector target information. |
1943 | const TargetTransformInfo &TTI; |
1944 | |
1945 | /// Target Library Info. |
1946 | const TargetLibraryInfo *TLI; |
1947 | |
1948 | /// Demanded bits analysis. |
1949 | DemandedBits *DB; |
1950 | |
1951 | /// Assumption cache. |
1952 | AssumptionCache *AC; |
1953 | |
1954 | /// Interface to emit optimization remarks. |
1955 | OptimizationRemarkEmitter *ORE; |
1956 | |
1957 | const Function *TheFunction; |
1958 | |
1959 | /// Loop Vectorize Hint. |
1960 | const LoopVectorizeHints *Hints; |
1961 | |
1962 | /// The interleave access information contains groups of interleaved accesses |
1963 | /// with the same stride and close to each other. |
1964 | InterleavedAccessInfo &InterleaveInfo; |
1965 | |
1966 | /// Values to ignore in the cost model. |
1967 | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1968 | |
1969 | /// Values to ignore in the cost model when VF > 1. |
1970 | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1971 | |
1972 | /// All element types found in the loop. |
1973 | SmallPtrSet<Type *, 16> ElementTypesInLoop; |
1974 | |
1975 | /// Profitable vector factors. |
1976 | SmallVector<VectorizationFactor, 8> ProfitableVFs; |
1977 | }; |
1978 | } // end namespace llvm |
1979 | |
1980 | /// Helper struct to manage generating runtime checks for vectorization. |
1981 | /// |
1982 | /// The runtime checks are created up-front in temporary blocks to allow more
1983 | /// accurate cost estimation, and are un-linked from the existing IR. After
1984 | /// deciding to vectorize, the checks are moved back. If deciding not to
1985 | /// vectorize, the temporary blocks are completely removed.
1986 | class GeneratedRTChecks { |
1987 | /// Basic block which contains the generated SCEV checks, if any. |
1988 | BasicBlock *SCEVCheckBlock = nullptr; |
1989 | |
1990 | /// The value representing the result of the generated SCEV checks. If it is |
1991 | /// nullptr, either no SCEV checks have been generated or they have been used. |
1992 | Value *SCEVCheckCond = nullptr; |
1993 | |
1994 | /// Basic block which contains the generated memory runtime checks, if any. |
1995 | BasicBlock *MemCheckBlock = nullptr; |
1996 | |
1997 | /// The value representing the result of the generated memory runtime checks. |
1998 | /// If it is nullptr, either no memory runtime checks have been generated or |
1999 | /// they have been used. |
2000 | Value *MemRuntimeCheckCond = nullptr; |
2001 | |
2002 | DominatorTree *DT; |
2003 | LoopInfo *LI; |
2004 | |
2005 | SCEVExpander SCEVExp; |
2006 | SCEVExpander MemCheckExp; |
2007 | |
2008 | public: |
2009 | GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, |
2010 | const DataLayout &DL) |
2011 | : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), |
2012 | MemCheckExp(SE, DL, "scev.check") {} |
2013 | |
2014 | /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
2015 | /// accurately estimate the cost of the runtime checks. The blocks are |
2016 | /// un-linked from the IR and are added back during vector code generation. If
2017 | /// there is no vector code generation, the check blocks are removed |
2018 | /// completely. |
2019 | void Create(Loop *L, const LoopAccessInfo &LAI, |
2020 | const SCEVUnionPredicate &UnionPred) { |
2021 | |
2022 | BasicBlock *LoopHeader = L->getHeader(); |
2023 | BasicBlock *Preheader = L->getLoopPreheader(); |
2024 | |
2025 | // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
2026 | // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
2027 | // may be used by SCEVExpander. The blocks will be un-linked from their |
2028 | // predecessors and removed from LI & DT at the end of the function. |
2029 | if (!UnionPred.isAlwaysTrue()) { |
2030 | SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, |
2031 | nullptr, "vector.scevcheck"); |
2032 | |
2033 | SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
2034 | &UnionPred, SCEVCheckBlock->getTerminator()); |
2035 | } |
2036 | |
2037 | const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
2038 | if (RtPtrChecking.Need) { |
2039 | auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
2040 | MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, |
2041 | "vector.memcheck"); |
2042 | |
2043 | MemRuntimeCheckCond = |
2044 | addRuntimeChecks(MemCheckBlock->getTerminator(), L, |
2045 | RtPtrChecking.getChecks(), MemCheckExp); |
2046 | assert(MemRuntimeCheckCond &&
2047 | "no RT checks generated although RtPtrChecking "
2048 | "claimed checks are required");
2049 | } |
2050 | |
2051 | if (!MemCheckBlock && !SCEVCheckBlock) |
2052 | return; |
2053 | |
2054 | // Unhook the temporary block with the checks, update various places |
2055 | // accordingly. |
2056 | if (SCEVCheckBlock) |
2057 | SCEVCheckBlock->replaceAllUsesWith(Preheader); |
2058 | if (MemCheckBlock) |
2059 | MemCheckBlock->replaceAllUsesWith(Preheader); |
2060 | |
2061 | if (SCEVCheckBlock) { |
2062 | SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
2063 | new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
2064 | Preheader->getTerminator()->eraseFromParent(); |
2065 | } |
2066 | if (MemCheckBlock) { |
2067 | MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); |
2068 | new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
2069 | Preheader->getTerminator()->eraseFromParent(); |
2070 | } |
2071 | |
2072 | DT->changeImmediateDominator(LoopHeader, Preheader); |
2073 | if (MemCheckBlock) { |
2074 | DT->eraseNode(MemCheckBlock); |
2075 | LI->removeBlock(MemCheckBlock); |
2076 | } |
2077 | if (SCEVCheckBlock) { |
2078 | DT->eraseNode(SCEVCheckBlock); |
2079 | LI->removeBlock(SCEVCheckBlock); |
2080 | } |
2081 | } |
2082 | |
2083 | /// Remove the created SCEV & memory runtime check blocks & instructions, if |
2084 | /// unused. |
2085 | ~GeneratedRTChecks() { |
2086 | SCEVExpanderCleaner SCEVCleaner(SCEVExp); |
2087 | SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); |
2088 | if (!SCEVCheckCond) |
2089 | SCEVCleaner.markResultUsed(); |
2090 | |
2091 | if (!MemRuntimeCheckCond) |
2092 | MemCheckCleaner.markResultUsed(); |
2093 | |
2094 | if (MemRuntimeCheckCond) { |
2095 | auto &SE = *MemCheckExp.getSE(); |
2096 | // Memory runtime check generation creates compares that use expanded |
2097 | // values. Remove them before running the SCEVExpanderCleaners. |
2098 | for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { |
2099 | if (MemCheckExp.isInsertedInstruction(&I)) |
2100 | continue; |
2101 | SE.forgetValue(&I); |
2102 | I.eraseFromParent(); |
2103 | } |
2104 | } |
2105 | MemCheckCleaner.cleanup(); |
2106 | SCEVCleaner.cleanup(); |
2107 | |
2108 | if (SCEVCheckCond) |
2109 | SCEVCheckBlock->eraseFromParent(); |
2110 | if (MemRuntimeCheckCond) |
2111 | MemCheckBlock->eraseFromParent(); |
2112 | } |
2113 | |
2114 | /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and |
2115 | /// adjusts the branches to branch to the vector preheader or \p Bypass, |
2116 | /// depending on the generated condition. |
2117 | BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, |
2118 | BasicBlock *LoopVectorPreHeader, |
2119 | BasicBlock *LoopExitBlock) { |
2120 | if (!SCEVCheckCond) |
2121 | return nullptr; |
2122 | if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) |
2123 | if (C->isZero()) |
2124 | return nullptr; |
2125 | |
2126 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2127 | |
2128 | BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); |
2129 | // Create new preheader for vector loop. |
2130 | if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
2131 | PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); |
2132 | |
2133 | SCEVCheckBlock->getTerminator()->eraseFromParent(); |
2134 | SCEVCheckBlock->moveBefore(LoopVectorPreHeader); |
2135 | Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
2136 | SCEVCheckBlock); |
2137 | |
2138 | DT->addNewBlock(SCEVCheckBlock, Pred); |
2139 | DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); |
2140 | |
2141 | ReplaceInstWithInst( |
2142 | SCEVCheckBlock->getTerminator(), |
2143 | BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); |
2144 | // Mark the check as used, to prevent it from being removed during cleanup. |
2145 | SCEVCheckCond = nullptr; |
2146 | return SCEVCheckBlock; |
2147 | } |
2148 | |
2149 | /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts |
2150 | /// the branches to branch to the vector preheader or \p Bypass, depending on |
2151 | /// the generated condition. |
2152 | BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, |
2153 | BasicBlock *LoopVectorPreHeader) { |
2154 | // Check if we generated code that checks at runtime whether arrays overlap. |
2155 | if (!MemRuntimeCheckCond) |
2156 | return nullptr; |
2157 | |
2158 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2159 | Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, |
2160 | MemCheckBlock); |
2161 | |
2162 | DT->addNewBlock(MemCheckBlock, Pred); |
2163 | DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); |
2164 | MemCheckBlock->moveBefore(LoopVectorPreHeader); |
2165 | |
2166 | if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) |
2167 | PL->addBasicBlockToLoop(MemCheckBlock, *LI); |
2168 | |
2169 | ReplaceInstWithInst( |
2170 | MemCheckBlock->getTerminator(), |
2171 | BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); |
2172 | MemCheckBlock->getTerminator()->setDebugLoc( |
2173 | Pred->getTerminator()->getDebugLoc()); |
2174 | |
2175 | // Mark the check as used, to prevent it from being removed during cleanup. |
2176 | MemRuntimeCheckCond = nullptr; |
2177 | return MemCheckBlock; |
2178 | } |
2179 | }; |
2180 | |
2181 | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
2182 | // vectorization. The loop needs to be annotated with #pragma omp simd |
2183 | // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the |
2184 | // vector length information is not provided, vectorization is not considered |
2185 | // explicit. Interleave hints are not allowed either. These limitations will be |
2186 | // relaxed in the future. |
2187 | // Please note that we are currently forced to abuse the pragma 'clang |
2188 | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
2189 | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
2190 | // provides *explicit vectorization hints* (LV can bypass legal checks and |
2191 | // assume that vectorization is legal). However, both hints are implemented |
2192 | // using the same metadata (llvm.loop.vectorize, processed by |
2193 | // LoopVectorizeHints). This will be fixed in the future when the native IR |
2194 | // representation for pragma 'omp simd' is introduced. |
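// For illustration (not in the original source), a loop nest this function |
// would accept, assuming the Clang loop-pragma spelling: |
//   #pragma clang loop vectorize(enable) vectorize_width(4) |
//   for (int i = 0; i < N; ++i)   // explicitly annotated outer loop |
//     for (int j = 0; j < M; ++j) |
//       A[i][j] = B[i][j] + C[i]; |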
2195 | static bool isExplicitVecOuterLoop(Loop *OuterLp, |
2196 | OptimizationRemarkEmitter *ORE) { |
2197 | assert(!OuterLp->isInnermost() && "This is not an outer loop"); |
2198 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
2199 | |
2200 | // Only outer loops with an explicit vectorization hint are supported. |
2201 | // Unannotated outer loops are ignored. |
2202 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
2203 | return false; |
2204 | |
2205 | Function *Fn = OuterLp->getHeader()->getParent(); |
2206 | if (!Hints.allowVectorization(Fn, OuterLp, |
2207 | true /*VectorizeOnlyWhenForced*/)) { |
2208 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); |
2209 | return false; |
2210 | } |
2211 | |
2212 | if (Hints.getInterleave() > 1) { |
2213 | // TODO: Interleave support is future work. |
2214 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
2215 | "outer loops.\n"); |
2216 | Hints.emitRemarkWithHints(); |
2217 | return false; |
2218 | } |
2219 | |
2220 | return true; |
2221 | } |
2222 | |
2223 | static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
2224 | OptimizationRemarkEmitter *ORE, |
2225 | SmallVectorImpl<Loop *> &V) { |
2226 | // Collect inner loops and outer loops without irreducible control flow. For |
2227 | // now, only collect outer loops that have explicit vectorization hints. If we |
2228 | // are stress testing the VPlan H-CFG construction, we collect the outermost |
2229 | // loop of every loop nest. |
2230 | if (L.isInnermost() || VPlanBuildStressTest || |
2231 | (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { |
2232 | LoopBlocksRPO RPOT(&L); |
2233 | RPOT.perform(LI); |
2234 | if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { |
2235 | V.push_back(&L); |
2236 | // TODO: Collect inner loops inside marked outer loops in case |
2237 | // vectorization fails for the outer loop. Do not invoke |
2238 | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
2239 | // already known to be reducible. We can use an inherited attribute for |
2240 | // that. |
2241 | return; |
2242 | } |
2243 | } |
2244 | for (Loop *InnerL : L) |
2245 | collectSupportedLoops(*InnerL, LI, ORE, V); |
2246 | } |
2247 | |
2248 | namespace { |
2249 | |
2250 | /// The LoopVectorize Pass. |
2251 | struct LoopVectorize : public FunctionPass { |
2252 | /// Pass identification, replacement for typeid |
2253 | static char ID; |
2254 | |
2255 | LoopVectorizePass Impl; |
2256 | |
2257 | explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, |
2258 | bool VectorizeOnlyWhenForced = false) |
2259 | : FunctionPass(ID), |
2260 | Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { |
2261 | initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
2262 | } |
2263 | |
2264 | bool runOnFunction(Function &F) override { |
2265 | if (skipFunction(F)) |
2266 | return false; |
2267 | |
2268 | auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
2269 | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
2270 | auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
2271 | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
2272 | auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
2273 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
2274 | auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
2275 | auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
2276 | auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
2277 | auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); |
2278 | auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); |
2279 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
2280 | auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
2281 | |
2282 | std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
2283 | [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; |
2284 | |
2285 | return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, |
2286 | GetLAA, *ORE, PSI).MadeAnyChange; |
2287 | } |
2288 | |
2289 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
2290 | AU.addRequired<AssumptionCacheTracker>(); |
2291 | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
2292 | AU.addRequired<DominatorTreeWrapperPass>(); |
2293 | AU.addRequired<LoopInfoWrapperPass>(); |
2294 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
2295 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
2296 | AU.addRequired<AAResultsWrapperPass>(); |
2297 | AU.addRequired<LoopAccessLegacyAnalysis>(); |
2298 | AU.addRequired<DemandedBitsWrapperPass>(); |
2299 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
2300 | AU.addRequired<InjectTLIMappingsLegacy>(); |
2301 | |
2302 | // We currently do not preserve the LoopInfo/DominatorTree analyses with |
2303 | // outer loop vectorization. Until this is addressed, mark these analyses as |
2304 | // preserved only for the non-VPlan-native path. |
2305 | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
2306 | if (!EnableVPlanNativePath) { |
2307 | AU.addPreserved<LoopInfoWrapperPass>(); |
2308 | AU.addPreserved<DominatorTreeWrapperPass>(); |
2309 | } |
2310 | |
2311 | AU.addPreserved<BasicAAWrapperPass>(); |
2312 | AU.addPreserved<GlobalsAAWrapperPass>(); |
2313 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
2314 | } |
2315 | }; |
2316 | |
2317 | } // end anonymous namespace |
2318 | |
2319 | //===----------------------------------------------------------------------===// |
2320 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
2321 | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
2322 | //===----------------------------------------------------------------------===// |
2323 | |
2324 | Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
2325 | // We need to place the broadcast of invariant variables outside the loop, |
2326 | // but only if it's proven safe to do so. Else, broadcast will be inside |
2327 | // vector loop body. |
2328 | Instruction *Instr = dyn_cast<Instruction>(V); |
2329 | bool SafeToHoist = OrigLoop->isLoopInvariant(V) && |
2330 | (!Instr || |
2331 | DT->dominates(Instr->getParent(), LoopVectorPreHeader)); |
2332 | // Place the code for broadcasting invariant variables in the new preheader. |
2333 | IRBuilder<>::InsertPointGuard Guard(Builder); |
2334 | if (SafeToHoist) |
2335 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
2336 | |
2337 | // Broadcast the scalar into all locations in the vector. |
2338 | Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
2339 | |
2340 | return Shuf; |
2341 | } |
2342 | |
2343 | /// This function adds |
2344 | /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) |
2345 | /// to each vector element of Val. The sequence starts at \p StartIdx. |
2346 | /// \p BinOp is only relevant for FP induction variables. |
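/// For illustration (not in the original source): with VF = 4, StartIdx = 0, |
/// Step = 2, and Val = <v, v, v, v>, the integer path below computes |
/// InitVec = <0, 1, 2, 3> and InitVec * Step = <0, 2, 4, 6>, and returns |
/// <v+0, v+2, v+4, v+6> as the "induction" result. |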
2347 | static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, |
2348 | Instruction::BinaryOps BinOp, ElementCount VF, |
2349 | IRBuilder<> &Builder) { |
2350 | assert(VF.isVector() && "only vector VFs are supported"); |
2351 | |
2352 | // Create and check the types. |
2353 | auto *ValVTy = cast<VectorType>(Val->getType()); |
2354 | ElementCount VLen = ValVTy->getElementCount(); |
2355 | |
2356 | Type *STy = Val->getType()->getScalarType(); |
2357 | assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && |
2358 | "Induction Step must be an integer or FP"); |
2359 | assert(Step->getType() == STy && "Step has wrong type"); |
2360 | |
2361 | SmallVector<Constant *, 8> Indices; |
2362 | |
2363 | // Create a vector of consecutive numbers from zero to VF. |
2364 | VectorType *InitVecValVTy = ValVTy; |
2365 | Type *InitVecValSTy = STy; |
Value stored to 'InitVecValSTy' during its initialization is never read | |
2366 | if (STy->isFloatingPointTy()) { |
2367 | InitVecValSTy = |
2368 | IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); |
2369 | InitVecValVTy = VectorType::get(InitVecValSTy, VLen); |
2370 | } |
2371 | Value *InitVec = Builder.CreateStepVector(InitVecValVTy); |
2372 | |
2373 | // Splat the StartIdx |
2374 | Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); |
2375 | |
2376 | if (STy->isIntegerTy()) { |
2377 | InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); |
2378 | Step = Builder.CreateVectorSplat(VLen, Step); |
2379 | assert(Step->getType() == Val->getType() && "Invalid step vec"); |
2380 | // FIXME: The newly created binary instructions should contain nsw/nuw |
2381 | // flags, which can be found from the original scalar operations. |
2382 | Step = Builder.CreateMul(InitVec, Step); |
2383 | return Builder.CreateAdd(Val, Step, "induction"); |
2384 | } |
2385 | |
2386 | // Floating point induction. |
2387 | assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && |
2388 | "Binary Opcode should be specified for FP induction"); |
2389 | InitVec = Builder.CreateUIToFP(InitVec, ValVTy); |
2390 | InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); |
2391 | |
2392 | Step = Builder.CreateVectorSplat(VLen, Step); |
2393 | Value *MulOp = Builder.CreateFMul(InitVec, Step); |
2394 | return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); |
2395 | } |
2396 | |
2397 | void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( |
2398 | const InductionDescriptor &II, Value *Step, Value *Start, |
2399 | Instruction *EntryVal, VPValue *Def, VPTransformState &State) { |
2400 | IRBuilder<> &Builder = State.Builder; |
2401 | assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && |
2402 | "Expected either an induction phi-node or a truncate of it!"); |
2403 | |
2404 | // Construct the initial value of the vector IV in the vector loop preheader |
2405 | auto CurrIP = Builder.saveIP(); |
2406 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
2407 | if (isa<TruncInst>(EntryVal)) { |
2408 | assert(Start->getType()->isIntegerTy() && |
2409 | "Truncation requires an integer type"); |
2410 | auto *TruncType = cast<IntegerType>(EntryVal->getType()); |
2411 | Step = Builder.CreateTrunc(Step, TruncType); |
2412 | Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); |
2413 | } |
2414 | |
2415 | Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); |
2416 | Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); |
2417 | Value *SteppedStart = getStepVector( |
2418 | SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); |
2419 | |
2420 | // We create vector phi nodes for both integer and floating-point induction |
2421 | // variables. Here, we determine the kind of arithmetic we will perform. |
2422 | Instruction::BinaryOps AddOp; |
2423 | Instruction::BinaryOps MulOp; |
2424 | if (Step->getType()->isIntegerTy()) { |
2425 | AddOp = Instruction::Add; |
2426 | MulOp = Instruction::Mul; |
2427 | } else { |
2428 | AddOp = II.getInductionOpcode(); |
2429 | MulOp = Instruction::FMul; |
2430 | } |
2431 | |
2432 | // Multiply the vectorization factor by the step using integer or |
2433 | // floating-point arithmetic as appropriate. |
2434 | Type *StepType = Step->getType(); |
2435 | Value *RuntimeVF; |
2436 | if (Step->getType()->isFloatingPointTy()) |
2437 | RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); |
2438 | else |
2439 | RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); |
2440 | Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); |
2441 | |
2442 | // Create a vector splat to use in the induction update. |
2443 | // |
2444 | // FIXME: If the step is non-constant, we create the vector splat with |
2445 | // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't |
2446 | // handle a constant vector splat. |
2447 | Value *SplatVF = isa<Constant>(Mul) |
2448 | ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) |
2449 | : Builder.CreateVectorSplat(State.VF, Mul); |
2450 | Builder.restoreIP(CurrIP); |
2451 | |
2452 | // We may need to add the step a number of times, depending on the unroll |
2453 | // factor. The last of those goes into the PHI. |
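// For illustration (not in the original source): with VF = 4, UF = 2, and |
// step s, Part 0 uses vec.ind, Part 1 uses vec.ind + splat(4*s), and the |
// value fed back into the PHI is vec.ind + splat(8*s) ("vec.ind.next"). |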
2454 | PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", |
2455 | &*LoopVectorBody->getFirstInsertionPt()); |
2456 | VecInd->setDebugLoc(EntryVal->getDebugLoc()); |
2457 | Instruction *LastInduction = VecInd; |
2458 | for (unsigned Part = 0; Part < UF; ++Part) { |
2459 | State.set(Def, LastInduction, Part); |
2460 | |
2461 | if (isa<TruncInst>(EntryVal)) |
2462 | addMetadata(LastInduction, EntryVal); |
2463 | |
2464 | LastInduction = cast<Instruction>( |
2465 | Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); |
2466 | LastInduction->setDebugLoc(EntryVal->getDebugLoc()); |
2467 | } |
2468 | |
2469 | // Move the last step to the end of the latch block. This ensures consistent |
2470 | // placement of all induction updates. |
2471 | auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
2472 | auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); |
2473 | LastInduction->moveBefore(Br); |
2474 | LastInduction->setName("vec.ind.next"); |
2475 | |
2476 | VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); |
2477 | VecInd->addIncoming(LastInduction, LoopVectorLatch); |
2478 | } |
2479 | |
2480 | void InnerLoopVectorizer::widenIntOrFpInduction( |
2481 | PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, |
2482 | Value *CanonicalIV) { |
2483 | Value *Start = Def->getStartValue()->getLiveInIRValue(); |
2484 | const InductionDescriptor &ID = Def->getInductionDescriptor(); |
2485 | TruncInst *Trunc = Def->getTruncInst(); |
2486 | IRBuilder<> &Builder = State.Builder; |
2487 | assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); |
2488 | assert(!State.VF.isZero() && "VF must be non-zero"); |
2489 | |
2490 | // The value from the original loop to which we are mapping the new induction |
2491 | // variable. |
2492 | Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; |
2493 | |
2494 | auto &DL = EntryVal->getModule()->getDataLayout(); |
2495 | |
2496 | // Generate code for the induction step. Note that induction steps are |
2497 | // required to be loop-invariant. |
2498 | auto CreateStepValue = [&](const SCEV *Step) -> Value * { |
2499 | assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && |
2500 | "Induction step should be loop invariant"); |
2501 | if (PSE.getSE()->isSCEVable(IV->getType())) { |
2502 | SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
2503 | return Exp.expandCodeFor(Step, Step->getType(), |
2504 | State.CFG.VectorPreHeader->getTerminator()); |
2505 | } |
2506 | return cast<SCEVUnknown>(Step)->getValue(); |
2507 | }; |
2508 | |
2509 | // The scalar value to broadcast. This is derived from the canonical |
2510 | // induction variable. If a truncation type is given, truncate the canonical |
2511 | // induction variable and step. Otherwise, derive these values from the |
2512 | // induction descriptor. |
2513 | auto CreateScalarIV = [&](Value *&Step) -> Value * { |
2514 | Value *ScalarIV = CanonicalIV; |
2515 | Type *NeededType = IV->getType(); |
2516 | if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { |
2517 | ScalarIV = |
2518 | NeededType->isIntegerTy() |
2519 | ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) |
2520 | : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); |
2521 | ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, |
2522 | State.CFG.PrevBB); |
2523 | ScalarIV->setName("offset.idx"); |
2524 | } |
2525 | if (Trunc) { |
2526 | auto *TruncType = cast<IntegerType>(Trunc->getType()); |
2527 | assert(Step->getType()->isIntegerTy() && |
2528 | "Truncation requires an integer step"); |
2529 | ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); |
2530 | Step = Builder.CreateTrunc(Step, TruncType); |
2531 | } |
2532 | return ScalarIV; |
2533 | }; |
2534 | |
2535 | // Fast-math-flags propagate from the original induction instruction. |
2536 | IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
2537 | if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) |
2538 | Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); |
2539 | |
2540 | // Now do the actual transformations, and start with creating the step value. |
2541 | Value *Step = CreateStepValue(ID.getStep()); |
2542 | if (State.VF.isScalar()) { |
2543 | Value *ScalarIV = CreateScalarIV(Step); |
2544 | Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), |
2545 | Step->getType()->getScalarSizeInBits()); |
2546 | |
2547 | Instruction::BinaryOps IncOp = ID.getInductionOpcode(); |
2548 | if (IncOp == Instruction::BinaryOpsEnd) |
2549 | IncOp = Instruction::Add; |
2550 | for (unsigned Part = 0; Part < UF; ++Part) { |
2551 | Value *StartIdx = ConstantInt::get(ScalarTy, Part); |
2552 | Instruction::BinaryOps MulOp = Instruction::Mul; |
2553 | if (Step->getType()->isFloatingPointTy()) { |
2554 | StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); |
2555 | MulOp = Instruction::FMul; |
2556 | } |
2557 | |
2558 | Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); |
2559 | Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); |
2560 | State.set(Def, EntryPart, Part); |
2561 | if (Trunc) { |
2562 | assert(!Step->getType()->isFloatingPointTy() && |
2563 | "fp inductions shouldn't be truncated"); |
2564 | addMetadata(EntryPart, Trunc); |
2565 | } |
2566 | } |
2567 | return; |
2568 | } |
2569 | |
2570 | // Create a new independent vector induction variable, if one is needed. |
2571 | if (Def->needsVectorIV()) |
2572 | createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); |
2573 | |
2574 | if (Def->needsScalarIV()) { |
2575 | // Create scalar steps that can be used by instructions we will later |
2576 | // scalarize. Note that the addition of the scalar steps will not increase |
2577 | // the number of instructions in the loop in the common case prior to |
2578 | // InstCombine. We will be trading one vector extract for each scalar step. |
2579 | Value *ScalarIV = CreateScalarIV(Step); |
2580 | buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); |
2581 | } |
2582 | } |
2583 | |
2584 | void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, |
2585 | Instruction *EntryVal, |
2586 | const InductionDescriptor &ID, |
2587 | VPValue *Def, |
2588 | VPTransformState &State) { |
2589 | IRBuilder<> &Builder = State.Builder; |
2590 | // We shouldn't have to build scalar steps if we aren't vectorizing. |
2591 | assert(State.VF.isVector() && "VF should be greater than one"); |
2592 | // Get the value type and ensure it and the step have the same integer type. |
2593 | Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); |
2594 | assert(ScalarIVTy == Step->getType() && |
2595 | "Val and Step should have the same type"); |
2596 | |
2597 | // We build scalar steps for both integer and floating-point induction |
2598 | // variables. Here, we determine the kind of arithmetic we will perform. |
2599 | Instruction::BinaryOps AddOp; |
2600 | Instruction::BinaryOps MulOp; |
2601 | if (ScalarIVTy->isIntegerTy()) { |
2602 | AddOp = Instruction::Add; |
2603 | MulOp = Instruction::Mul; |
2604 | } else { |
2605 | AddOp = ID.getInductionOpcode(); |
2606 | MulOp = Instruction::FMul; |
2607 | } |
2608 | |
2609 | // Determine the number of scalars we need to generate for each unroll |
2610 | // iteration. |
2611 | bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); |
2612 | unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); |
2613 | // Compute the scalar steps and save the results in State. |
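// For illustration (not in the original source): with a fixed VF = 4, UF = 2, |
// scalar IV i, and step s, Part 0 below yields the lane values i+0*s .. i+3*s |
// and Part 1 yields i+4*s .. i+7*s (StartIdx0 = Part * VF). |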
2614 | Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), |
2615 | ScalarIVTy->getScalarSizeInBits()); |
2616 | Type *VecIVTy = nullptr; |
2617 | Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; |
2618 | if (!FirstLaneOnly && State.VF.isScalable()) { |
2619 | VecIVTy = VectorType::get(ScalarIVTy, State.VF); |
2620 | UnitStepVec = |
2621 | Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); |
2622 | SplatStep = Builder.CreateVectorSplat(State.VF, Step); |
2623 | SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); |
2624 | } |
2625 | |
2626 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
2627 | Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); |
2628 | |
2629 | if (!FirstLaneOnly && State.VF.isScalable()) { |
2630 | auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); |
2631 | auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); |
2632 | if (ScalarIVTy->isFloatingPointTy()) |
2633 | InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); |
2634 | auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); |
2635 | auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); |
2636 | State.set(Def, Add, Part); |
2637 | // It's also useful to record the lane values for the known minimum number |
2638 | // of elements, so we do that below as well. This improves the code quality |
2639 | // when, for example, extracting the first element. |
2640 | } |
2641 | |
2642 | if (ScalarIVTy->isFloatingPointTy()) |
2643 | StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); |
2644 | |
2645 | for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
2646 | Value *StartIdx = Builder.CreateBinOp( |
2647 | AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); |
2648 | // The step returned by `createStepForVF` is a runtime-evaluated value |
2649 | // when VF is scalable. Otherwise, it should be folded into a Constant. |
2650 | assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && |
2651 | "Expected StartIdx to be folded to a constant when VF is not " |
2652 | "scalable"); |
2653 | auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); |
2654 | auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); |
2655 | State.set(Def, Add, VPIteration(Part, Lane)); |
2656 | } |
2657 | } |
2658 | } |
2659 | |
2660 | void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, |
2661 | const VPIteration &Instance, |
2662 | VPTransformState &State) { |
2663 | Value *ScalarInst = State.get(Def, Instance); |
2664 | Value *VectorValue = State.get(Def, Instance.Part); |
2665 | VectorValue = Builder.CreateInsertElement( |
2666 | VectorValue, ScalarInst, |
2667 | Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); |
2668 | State.set(Def, VectorValue, Instance.Part); |
2669 | } |
2670 | |
2671 | // Return whether we allow using masked interleave-groups (for dealing with |
2672 | // strided loads/stores that reside in predicated blocks, or for dealing |
2673 | // with gaps). |
2674 | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2675 | // If an override option has been passed in for interleaved accesses, use it. |
2676 | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2677 | return EnableMaskedInterleavedMemAccesses; |
2678 | |
2679 | return TTI.enableMaskedInterleavedAccessVectorization(); |
2680 | } |
2681 | |
2682 | // Try to vectorize the interleave group that \p Instr belongs to. |
2683 | // |
2684 | // E.g. Translate following interleaved load group (factor = 3): |
2685 | // for (i = 0; i < N; i+=3) { |
2686 | // R = Pic[i]; // Member of index 0 |
2687 | // G = Pic[i+1]; // Member of index 1 |
2688 | // B = Pic[i+2]; // Member of index 2 |
2689 | // ... // do something to R, G, B |
2690 | // } |
2691 | // To: |
2692 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
2693 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
2694 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
2695 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
2696 | // |
2697 | // Or translate following interleaved store group (factor = 3): |
2698 | // for (i = 0; i < N; i+=3) { |
2699 | // ... do something to R, G, B |
2700 | // Pic[i] = R; // Member of index 0 |
2701 | // Pic[i+1] = G; // Member of index 1 |
2702 | // Pic[i+2] = B; // Member of index 2 |
2703 | // } |
2704 | // To: |
2705 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
2706 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
2707 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
2708 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
2709 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
2710 | void InnerLoopVectorizer::vectorizeInterleaveGroup( |
2711 | const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, |
2712 | VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, |
2713 | VPValue *BlockInMask) { |
2714 | Instruction *Instr = Group->getInsertPos(); |
2715 | const DataLayout &DL = Instr->getModule()->getDataLayout(); |
2716 | |
2717 | // Prepare for the vector type of the interleaved load/store. |
2718 | Type *ScalarTy = getLoadStoreType(Instr); |
2719 | unsigned InterleaveFactor = Group->getFactor(); |
2720 | assert(!VF.isScalable() && "scalable vectors not yet supported."); |
2721 | auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); |
2722 | |
2723 | // Prepare for the new pointers. |
2724 | SmallVector<Value *, 2> AddrParts; |
2725 | unsigned Index = Group->getIndex(Instr); |
2726 | |
2727 | // TODO: extend the masked interleaved-group support to reversed access. |
2728 | assert((!BlockInMask || !Group->isReverse()) && |
2729 | "Reversed masked interleave-group not supported."); |
2730 | |
2731 | // If the group is reverse, adjust the index to refer to the last vector lane |
2732 | // instead of the first. We adjust the index from the first vector lane, |
2733 | // rather than directly getting the pointer for lane VF - 1, because the |
2734 | // pointer operand of the interleaved access is supposed to be uniform. For |
2735 | // uniform instructions, we're only required to generate a value for the |
2736 | // first vector lane in each unroll iteration. |
2737 | if (Group->isReverse()) |
2738 | Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); |
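// For illustration (not in the original source): with VF = 4, an interleave |
// factor of 3, and an insert position at member index 1, a reversed group |
// yields Index = 1 + (4 - 1) * 3 = 10, so the negative GEP below lands on |
// member 0 of the lowest-addressed tuple covered by the wide access. |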
2739 | |
2740 | for (unsigned Part = 0; Part < UF; Part++) { |
2741 | Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); |
2742 | setDebugLocFromInst(AddrPart); |
2743 | |
2744 | // Note that the current instruction could be at any member index, so the |
2745 | // address needs to be adjusted to that of the member at index 0. |
2746 | // |
2747 | // E.g. a = A[i+1]; // Member of index 1 (Current instruction) |
2748 | // b = A[i]; // Member of index 0 |
2749 | // Current pointer is pointed to A[i+1], adjust it to A[i]. |
2750 | // |
2751 | // E.g. A[i+1] = a; // Member of index 1 |
2752 | // A[i] = b; // Member of index 0 |
2753 | // A[i+2] = c; // Member of index 2 (Current instruction) |
2754 | // Current pointer is pointed to A[i+2], adjust it to A[i]. |
2755 | |
2756 | bool InBounds = false; |
2757 | if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) |
2758 | InBounds = gep->isInBounds(); |
2759 | AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); |
2760 | cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); |
2761 | |
2762 | // Cast to the vector pointer type. |
2763 | unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); |
2764 | Type *PtrTy = VecTy->getPointerTo(AddressSpace); |
2765 | AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); |
2766 | } |
2767 | |
2768 | setDebugLocFromInst(Instr); |
2769 | Value *PoisonVec = PoisonValue::get(VecTy); |
2770 | |
2771 | Value *MaskForGaps = nullptr; |
2772 | if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { |
2773 | MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); |
2774 | assert(MaskForGaps && "Mask for Gaps is required but it is null"); |
2775 | } |
2776 | |
2777 | // Vectorize the interleaved load group. |
2778 | if (isa<LoadInst>(Instr)) { |
2779 | // For each unroll part, create a wide load for the group. |
2780 | SmallVector<Value *, 2> NewLoads; |
2781 | for (unsigned Part = 0; Part < UF; Part++) { |
2782 | Instruction *NewLoad; |
2783 | if (BlockInMask || MaskForGaps) { |
2784 | assert(useMaskedInterleavedAccesses(*TTI) && |
2785 | "masked interleaved groups are not allowed."); |
2786 | Value *GroupMask = MaskForGaps; |
2787 | if (BlockInMask) { |
2788 | Value *BlockInMaskPart = State.get(BlockInMask, Part); |
2789 | Value *ShuffledMask = Builder.CreateShuffleVector( |
2790 | BlockInMaskPart, |
2791 | createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), |
2792 | "interleaved.mask"); |
2793 | GroupMask = MaskForGaps |
2794 | ? Builder.CreateBinOp(Instruction::And, ShuffledMask, |
2795 | MaskForGaps) |
2796 | : ShuffledMask; |
2797 | } |
2798 | NewLoad = |
2799 | Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), |
2800 | GroupMask, PoisonVec, "wide.masked.vec"); |
2801 | } |
2802 | else |
2803 | NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], |
2804 | Group->getAlign(), "wide.vec"); |
2805 | Group->addMetadata(NewLoad); |
2806 | NewLoads.push_back(NewLoad); |
2807 | } |
2808 | |
2809 | // For each member in the group, shuffle out the appropriate data from the |
2810 | // wide loads. |
2811 | unsigned J = 0; |
2812 | for (unsigned I = 0; I < InterleaveFactor; ++I) { |
2813 | Instruction *Member = Group->getMember(I); |
2814 | |
2815 | // Skip the gaps in the group. |
2816 | if (!Member) |
2817 | continue; |
2818 | |
2819 | auto StrideMask = |
2820 | createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); |
2821 | for (unsigned Part = 0; Part < UF; Part++) { |
2822 | Value *StridedVec = Builder.CreateShuffleVector( |
2823 | NewLoads[Part], StrideMask, "strided.vec"); |
2824 | |
2826 | // If this member has a different type, cast the result to it. |
2826 | if (Member->getType() != ScalarTy) { |
2827 | assert(!VF.isScalable() && "VF is assumed to be non scalable."); |
2828 | VectorType *OtherVTy = VectorType::get(Member->getType(), VF); |
2829 | StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); |
2830 | } |
2831 | |
2832 | if (Group->isReverse()) |
2833 | StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); |
2834 | |
2835 | State.set(VPDefs[J], StridedVec, Part); |
2836 | } |
2837 | ++J; |
2838 | } |
2839 | return; |
2840 | } |
2841 | |
2842 | // The sub vector type for current instruction. |
2843 | auto *SubVT = VectorType::get(ScalarTy, VF); |
2844 | |
2845 | // Vectorize the interleaved store group. |
2846 | MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); |
2847 | assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && |
2848 | "masked interleaved groups are not allowed."); |
2849 | assert((!MaskForGaps || !VF.isScalable()) && |
2850 | "masking gaps for scalable vectors is not yet supported."); |
2851 | for (unsigned Part = 0; Part < UF; Part++) { |
2852 | // Collect the stored vector from each member. |
2853 | SmallVector<Value *, 4> StoredVecs; |
2854 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
2855 | assert((Group->getMember(i) || MaskForGaps) && |
2856 | "Fail to get a member from an interleaved store group"); |
2857 | Instruction *Member = Group->getMember(i); |
2858 | |
2859 | // Skip the gaps in the group. |
2860 | if (!Member) { |
2861 | Value *Undef = PoisonValue::get(SubVT); |
2862 | StoredVecs.push_back(Undef); |
2863 | continue; |
2864 | } |
2865 | |
2866 | Value *StoredVec = State.get(StoredValues[i], Part); |
2867 | |
2868 | if (Group->isReverse()) |
2869 | StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); |
2870 | |
2871 | // If this member has a different type, cast it to a unified type. |
2872 | |
2873 | if (StoredVec->getType() != SubVT) |
2874 | StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); |
2875 | |
2876 | StoredVecs.push_back(StoredVec); |
2877 | } |
2878 | |
2879 | // Concatenate all vectors into a wide vector. |
2880 | Value *WideVec = concatenateVectors(Builder, StoredVecs); |
2881 | |
2882 | // Interleave the elements in the wide vector. |
2883 | Value *IVec = Builder.CreateShuffleVector( |
2884 | WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), |
2885 | "interleaved.vec"); |
2886 | |
2887 | Instruction *NewStoreInstr; |
2888 | if (BlockInMask || MaskForGaps) { |
2889 | Value *GroupMask = MaskForGaps; |
2890 | if (BlockInMask) { |
2891 | Value *BlockInMaskPart = State.get(BlockInMask, Part); |
2892 | Value *ShuffledMask = Builder.CreateShuffleVector( |
2893 | BlockInMaskPart, |
2894 | createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), |
2895 | "interleaved.mask"); |
2896 | GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, |
2897 | ShuffledMask, MaskForGaps) |
2898 | : ShuffledMask; |
2899 | } |
2900 | NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], |
2901 | Group->getAlign(), GroupMask); |
2902 | } else |
2903 | NewStoreInstr = |
2904 | Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); |
2905 | |
2906 | Group->addMetadata(NewStoreInstr); |
2907 | } |
2908 | } |
2909 | |
2910 | void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, |
2911 | VPReplicateRecipe *RepRecipe, |
2912 | const VPIteration &Instance, |
2913 | bool IfPredicateInstr, |
2914 | VPTransformState &State) { |
2915 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); |
2916 | |
2917 | // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for |
2918 | // the first lane and part. |
2919 | if (isa<NoAliasScopeDeclInst>(Instr)) |
2920 | if (!Instance.isFirstIteration()) |
2921 | return; |
2922 | |
2923 | setDebugLocFromInst(Instr); |
2924 | |
2925 | // Does this instruction return a value? |
2926 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
2927 | |
2928 | Instruction *Cloned = Instr->clone(); |
2929 | if (!IsVoidRetTy) |
2930 | Cloned->setName(Instr->getName() + ".cloned"); |
2931 | |
2932 | // If the scalarized instruction contributes to the address computation of a |
2933 | // widened masked load/store which was in a basic block that needed |
2934 | // predication and is not predicated after vectorization, we can't propagate |
2935 | // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized |
2936 | // instruction could feed a poison value to the base address of the widened |
2937 | // load/store. |
2938 | if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) |
2939 | Cloned->dropPoisonGeneratingFlags(); |
2940 | |
2941 | State.Builder.SetInsertPoint(Builder.GetInsertBlock(), |
2942 | Builder.GetInsertPoint()); |
2943 | // Replace the operands of the cloned instructions with their scalar |
2944 | // equivalents in the new loop. |
2945 | for (auto &I : enumerate(RepRecipe->operands())) { |
2946 | auto InputInstance = Instance; |
2947 | VPValue *Operand = I.value(); |
2948 | VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); |
2949 | if (OperandR && OperandR->isUniform()) |
2950 | InputInstance.Lane = VPLane::getFirstLane(); |
2951 | Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); |
2952 | } |
2953 | addNewMetadata(Cloned, Instr); |
2954 | |
2955 | // Place the cloned scalar in the new loop. |
2956 | Builder.Insert(Cloned); |
2957 | |
2958 | State.set(RepRecipe, Cloned, Instance); |
2959 | |
2960 | // If we just cloned a new assumption, add it to the assumption cache. |
2961 | if (auto *II = dyn_cast<AssumeInst>(Cloned)) |
2962 | AC->registerAssumption(II); |
2963 | |
2964 | // End if-block. |
2965 | if (IfPredicateInstr) |
2966 | PredicatedInstructions.push_back(Cloned); |
2967 | } |
2968 | |
2969 | void InnerLoopVectorizer::createHeaderBranch(Loop *L) { |
2970 | BasicBlock *Header = L->getHeader(); |
2971 | assert(!L->getLoopLatch() && "loop should not have a latch at this point"); |
2972 | |
2973 | IRBuilder<> B(Header->getTerminator()); |
2974 | Instruction *OldInst = |
2975 | getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); |
2976 | setDebugLocFromInst(OldInst, &B); |
2977 | |
2978 | // Connect the header to the exit and header blocks and replace the old |
2979 | // terminator. |
2980 | B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); |
2981 | |
2982 | // Now we have two terminators. Remove the old one from the block. |
2983 | Header->getTerminator()->eraseFromParent(); |
2984 | } |
2985 | |
2986 | Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { |
2987 | if (TripCount) |
2988 | return TripCount; |
2989 | |
2990 | assert(L && "Create Trip Count for null loop."); |
2991 | IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
2992 | // Find the loop boundaries. |
2993 | ScalarEvolution *SE = PSE.getSE(); |
2994 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
2995 | assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && |
2996 | "Invalid loop count"); |
2997 | |
2998 | Type *IdxTy = Legal->getWidestInductionType(); |
2999 | assert(IdxTy && "No type for induction"); |
3000 | |
3001 | // The exit count might be of type i64 while the phi is i32. This can happen |
3002 | // if we have an induction variable that is sign-extended before the compare. |
3003 | // The only way we can get a backedge-taken count here is if the induction |
3004 | // variable was signed and as such will not overflow. In such a case, |
3005 | // truncation is legal. |
3006 | if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > |
3007 | IdxTy->getPrimitiveSizeInBits()) |
3008 | BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); |
3009 | BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); |
3010 | |
3011 | // Get the total trip count by adding 1 to the backedge-taken count. |
3012 | const SCEV *ExitCount = SE->getAddExpr( |
3013 | BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); |
3014 | |
3015 | const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); |
3016 | |
3017 | // Expand the trip count and place the new instructions in the preheader. |
3018 | // Notice that the pre-header does not change, only the loop body. |
3019 | SCEVExpander Exp(*SE, DL, "induction"); |
3020 | |
3021 | // Count holds the overall loop count (N). |
3022 | TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
3023 | L->getLoopPreheader()->getTerminator()); |
3024 | |
3025 | if (TripCount->getType()->isPointerTy()) |
3026 | TripCount = |
3027 | CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", |
3028 | L->getLoopPreheader()->getTerminator()); |
3029 | |
3030 | return TripCount; |
3031 | } |
3032 | |
3033 | Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { |
3034 | if (VectorTripCount) |
3035 | return VectorTripCount; |
3036 | |
3037 | Value *TC = getOrCreateTripCount(L); |
3038 | IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); |
3039 | |
3040 | Type *Ty = TC->getType(); |
3041 | // This is where we can make the step a runtime constant. |
3042 | Value *Step = createStepForVF(Builder, Ty, VF, UF); |
3043 | |
3044 | // If the tail is to be folded by masking, round the number of iterations N |
3045 | // up to a multiple of Step instead of rounding down. This is done by first |
3046 | // adding Step-1 and then rounding down. Note that it's ok if this addition |
3047 | // overflows: the vector induction variable will eventually wrap to zero given |
3048 | // that it starts at zero and its Step is a power of two; the loop will then |
3049 | // exit, with the last early-exit vector comparison also producing all-true. |
3050 | if (Cost->foldTailByMasking()) { |
3051 | assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && |
3052 | "VF*UF must be a power of 2 when folding tail by masking"); |
3053 | Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); |
3054 | TC = Builder.CreateAdd( |
3055 | TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); |
3056 | } |
3057 | |
3058 | // Now we need to generate the expression for the part of the loop that the |
3059 | // vectorized body will execute. This is equal to N - (N % Step) if scalar |
3060 | // iterations are not required for correctness, or N - Step, otherwise. Step |
3061 | // is equal to the vectorization factor (number of SIMD elements) times the |
3062 | // unroll factor (number of SIMD instructions). |
3063 | Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); |
3064 | |
3065 | // There are cases where we *must* run at least one iteration in the remainder |
3066 | // loop. See the cost model for when this can happen. If the step evenly |
3067 | // divides the trip count, we set the remainder to be equal to the step. If |
3068 | // the step does not evenly divide the trip count, no adjustment is necessary |
3069 | // since there will already be scalar iterations. Note that the minimum |
3070 | // iterations check ensures that N >= Step. |
3071 | if (Cost->requiresScalarEpilogue(VF)) { |
3072 | auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); |
3073 | R = Builder.CreateSelect(IsZero, Step, R); |
3074 | } |
3075 | |
3076 | VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); |
3077 | |
3078 | return VectorTripCount; |
3079 | } |
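
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] A minimal
// integer model of the vector trip-count logic above. All names are
// hypothetical; Step stands for VF * UF and is assumed to be a power of two
// whenever FoldTail is set, matching the assertion in the real code.
#include <cstdint>

uint64_t modelVectorTripCount(uint64_t N, uint64_t Step, bool FoldTail,
                              bool RequiresScalarEpilogue) {
  if (FoldTail)
    N += Step - 1;     // Round N up to a multiple of Step; wrap is benign.
  uint64_t R = N % Step;             // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                        // Force at least one remainder iteration.
  return N - R;                      // n.vec
}
// modelVectorTripCount(10, 4, false, false) == 8  (2 scalar iterations left).
// modelVectorTripCount(10, 4, true,  false) == 12 (tail handled by masking).
// modelVectorTripCount(8,  4, false, true)  == 4  (epilogue must run once).
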
3080 | |
3081 | Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, |
3082 | const DataLayout &DL) { |
3083 | // Verify that V is a vector type with same number of elements as DstVTy. |
3084 | auto *DstFVTy = cast<FixedVectorType>(DstVTy); |
3085 | unsigned VF = DstFVTy->getNumElements(); |
3086 | auto *SrcVecTy = cast<FixedVectorType>(V->getType()); |
3087 | assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3088 | Type *SrcElemTy = SrcVecTy->getElementType(); |
3089 | Type *DstElemTy = DstFVTy->getElementType(); |
3090 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3091 | "Vector elements must have same size");
3092 | |
3093 | // Do a direct cast if element types are castable. |
3094 | if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { |
3095 | return Builder.CreateBitOrPointerCast(V, DstFVTy); |
3096 | } |
3097 | // V cannot be directly cast to the desired vector type. This may happen
3098 | // when V is a floating point vector but DstVTy is a vector of pointers or
3099 | // vice-versa. Handle this using a two-step bitcast through an intermediate
3100 | // integer type, i.e. Ptr <-> Int <-> Float.
3101 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3102 | "Only one type should be a pointer type");
3103 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3104 | "Only one type should be a floating point type");
3105 | Type *IntTy = |
3106 | IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); |
3107 | auto *VecIntTy = FixedVectorType::get(IntTy, VF); |
3108 | Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); |
3109 | return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); |
3110 | } |
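
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The scalar
// analogue of the two-step Ptr <-> Int <-> Float detour above, assuming
// 64-bit pointers. A double and a pointer cannot be bit-converted directly,
// but both round-trip losslessly through an integer of the same bit width.
// Names are hypothetical.
#include <cstdint>
#include <cstring>

uint64_t bitsOfDouble(double D) {
  uint64_t Bits;
  static_assert(sizeof(Bits) == sizeof(D), "element sizes must match");
  std::memcpy(&Bits, &D, sizeof(Bits)); // The scalar analogue of a bitcast.
  return Bits;
}

void *doubleBitsAsPointer(double D) {
  // Float -> Int -> Ptr: two bit-preserving steps, as in the vector code.
  return reinterpret_cast<void *>(bitsOfDouble(D));
}
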
3111 | |
3112 | void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, |
3113 | BasicBlock *Bypass) { |
3114 | Value *Count = getOrCreateTripCount(L); |
3115 | // Reuse existing vector loop preheader for TC checks. |
3116 | // Note that new preheader block is generated for vector loop. |
3117 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
3118 | IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
3119 | |
3120 | // Generate code to check if the loop's trip count is less than VF * UF, or |
3121 | // equal to it in case a scalar epilogue is required; this implies that the |
3122 | // vector trip count is zero. This check also covers the case where adding one |
3123 | // to the backedge-taken count overflowed leading to an incorrect trip count |
3124 | // of zero. In this case we will also jump to the scalar loop. |
3125 | auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE |
3126 | : ICmpInst::ICMP_ULT; |
3127 | |
3128 | // If tail is to be folded, vector loop takes care of all iterations. |
3129 | Value *CheckMinIters = Builder.getFalse(); |
3130 | if (!Cost->foldTailByMasking()) { |
3131 | Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); |
3132 | CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); |
3133 | } |
3134 | // Create new preheader for vector loop. |
3135 | LoopVectorPreHeader = |
3136 | SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, |
3137 | "vector.ph"); |
3138 | |
3139 | assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3140 | DT->getNode(Bypass)->getIDom()) &&
3141 | "TC check is expected to dominate Bypass");
3142 | |
3143 | // Update dominator for Bypass & LoopExit (if needed). |
3144 | DT->changeImmediateDominator(Bypass, TCCheckBlock); |
3145 | if (!Cost->requiresScalarEpilogue(VF)) |
3146 | // If there is an epilogue which must run, there's no edge from the |
3147 | // middle block to exit blocks and thus no need to update the immediate |
3148 | // dominator of the exit blocks. |
3149 | DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); |
3150 | |
3151 | ReplaceInstWithInst( |
3152 | TCCheckBlock->getTerminator(), |
3153 | BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); |
3154 | LoopBypassBlocks.push_back(TCCheckBlock); |
3155 | } |
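
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The bypass
// predicate built above, as a plain boolean function with hypothetical names
// (Step models VF * UF). A true result means: skip the vector loop and
// branch to the scalar loop.
#include <cstdint>

bool modelCheckMinIters(uint64_t Count, uint64_t Step, bool FoldTail,
                        bool RequiresScalarEpilogue) {
  if (FoldTail)
    return false; // The masked vector loop handles all iterations.
  // ULE when an epilogue must run: even Count == Step leaves a zero-trip
  // vector loop after peeling off the required scalar iterations.
  return RequiresScalarEpilogue ? Count <= Step : Count < Step;
}
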
3156 | |
3157 | BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { |
3158 | |
3159 | BasicBlock *const SCEVCheckBlock = |
3160 | RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); |
3161 | if (!SCEVCheckBlock) |
3162 | return nullptr; |
3163 | |
3164 | assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3165 | (OptForSizeBasedOnProfile &&
3166 | Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3167 | "Cannot SCEV check stride or overflow when optimizing for size");
3168 | |
3169 | |
3170 | // Update dominator only if this is first RT check. |
3171 | if (LoopBypassBlocks.empty()) { |
3172 | DT->changeImmediateDominator(Bypass, SCEVCheckBlock); |
3173 | if (!Cost->requiresScalarEpilogue(VF)) |
3174 | // If there is an epilogue which must run, there's no edge from the |
3175 | // middle block to exit blocks and thus no need to update the immediate |
3176 | // dominator of the exit blocks. |
3177 | DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); |
3178 | } |
3179 | |
3180 | LoopBypassBlocks.push_back(SCEVCheckBlock); |
3181 | AddedSafetyChecks = true; |
3182 | return SCEVCheckBlock; |
3183 | } |
3184 | |
3185 | BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, |
3186 | BasicBlock *Bypass) { |
3187 | // VPlan-native path does not do any analysis for runtime checks currently. |
3188 | if (EnableVPlanNativePath) |
3189 | return nullptr; |
3190 | |
3191 | BasicBlock *const MemCheckBlock = |
3192 | RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); |
3193 | |
3194 | // Check if we generated code that checks at runtime whether arrays overlap.
3195 | // We put the checks into a separate block to make the more common case of
3196 | // few elements faster.
3197 | if (!MemCheckBlock) |
3198 | return nullptr; |
3199 | |
3200 | if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { |
3201 | assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3202 | "Cannot emit memory checks when optimizing for size, unless forced "
3203 | "to vectorize.");
3204 | ORE->emit([&]() { |
3205 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3206 | L->getStartLoc(), L->getHeader()) |
3207 | << "Code-size may be reduced by not forcing " |
3208 | "vectorization, or by source-code modifications " |
3209 | "eliminating the need for runtime checks " |
3210 | "(e.g., adding 'restrict')."; |
3211 | }); |
3212 | } |
3213 | |
3214 | LoopBypassBlocks.push_back(MemCheckBlock); |
3215 | |
3216 | AddedSafetyChecks = true; |
3217 | |
3218 | // We currently don't use LoopVersioning for the actual loop cloning but we |
3219 | // still use it to add the noalias metadata. |
3220 | LVer = std::make_unique<LoopVersioning>( |
3221 | *Legal->getLAI(), |
3222 | Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, |
3223 | DT, PSE.getSE()); |
3224 | LVer->prepareNoAliasMetadata(); |
3225 | return MemCheckBlock; |
3226 | } |
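
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The emitted
// memory checks reduce to pairwise interval-overlap tests over the address
// ranges each pointer group may touch. A minimal model with hypothetical
// names, using half-open ranges:
#include <cstdint>

bool modelRangesOverlap(uintptr_t AStart, uintptr_t AEnd,   // [AStart, AEnd)
                        uintptr_t BStart, uintptr_t BEnd) { // [BStart, BEnd)
  return AStart < BEnd && BStart < AEnd;
}
// If any may-alias pair overlaps at runtime, control branches to the scalar
// loop; otherwise the vector loop runs under the noalias assumption.
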
3227 | |
3228 | Value *InnerLoopVectorizer::emitTransformedIndex( |
3229 | IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, |
3230 | const InductionDescriptor &ID, BasicBlock *VectorHeader) const { |
3231 | |
3232 | SCEVExpander Exp(*SE, DL, "induction"); |
3233 | auto Step = ID.getStep(); |
3234 | auto StartValue = ID.getStartValue(); |
3235 | assert(Index->getType()->getScalarType() == Step->getType() &&
3236 | "Index scalar type does not match StepValue type");
3237 | |
3238 | // Note: the IR at this point is broken. We cannot use SE to create any new
3239 | // SCEV and then expand it, hoping that SCEV's simplification will give us
3240 | // more optimal code. Unfortunately, attempting to do so on invalid IR may
3241 | // lead to various SCEV crashes. So all we can do is use the builder and
3242 | // rely on InstCombine for future simplifications. Here we handle some
3243 | // trivial cases only.
3244 | auto CreateAdd = [&B](Value *X, Value *Y) { |
3245 | assert(X->getType() == Y->getType() && "Types don't match!");
3246 | if (auto *CX = dyn_cast<ConstantInt>(X)) |
3247 | if (CX->isZero()) |
3248 | return Y; |
3249 | if (auto *CY = dyn_cast<ConstantInt>(Y)) |
3250 | if (CY->isZero()) |
3251 | return X; |
3252 | return B.CreateAdd(X, Y); |
3253 | }; |
3254 | |
3255 | // We allow X to be a vector type, in which case Y will potentially be |
3256 | // splatted into a vector with the same element count. |
3257 | auto CreateMul = [&B](Value *X, Value *Y) { |
3258 | assert(X->getType()->getScalarType() == Y->getType() &&
3259 | "Types don't match!");
3260 | if (auto *CX = dyn_cast<ConstantInt>(X)) |
3261 | if (CX->isOne()) |
3262 | return Y; |
3263 | if (auto *CY = dyn_cast<ConstantInt>(Y)) |
3264 | if (CY->isOne()) |
3265 | return X; |
3266 | VectorType *XVTy = dyn_cast<VectorType>(X->getType()); |
3267 | if (XVTy && !isa<VectorType>(Y->getType())) |
3268 | Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); |
3269 | return B.CreateMul(X, Y); |
3270 | }; |
3271 | |
3272 | // Get a suitable insert point for SCEV expansion. For blocks in the vector |
3273 | // loop, choose the end of the vector loop header (=VectorHeader), because |
3274 | // the DomTree is not kept up-to-date for additional blocks generated in the |
3275 | // vector loop. By using the header as insertion point, we guarantee that the |
3276 | // expanded instructions dominate all their uses. |
3277 | auto GetInsertPoint = [this, &B, VectorHeader]() { |
3278 | BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); |
3279 | if (InsertBB != LoopVectorBody && |
3280 | LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) |
3281 | return VectorHeader->getTerminator(); |
3282 | return &*B.GetInsertPoint(); |
3283 | }; |
3284 | |
3285 | switch (ID.getKind()) { |
3286 | case InductionDescriptor::IK_IntInduction: { |
3287 | assert(!isa<VectorType>(Index->getType()) &&
3288 | "Vector indices not supported for integer inductions yet");
3289 | assert(Index->getType() == StartValue->getType() &&
3290 | "Index type does not match StartValue type");
3291 | if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) |
3292 | return B.CreateSub(StartValue, Index); |
3293 | auto *Offset = CreateMul( |
3294 | Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); |
3295 | return CreateAdd(StartValue, Offset); |
3296 | } |
3297 | case InductionDescriptor::IK_PtrInduction: { |
3298 | assert(isa<SCEVConstant>(Step) &&
3299 | "Expected constant step for pointer induction");
3300 | return B.CreateGEP( |
3301 | ID.getElementType(), StartValue, |
3302 | CreateMul(Index, |
3303 | Exp.expandCodeFor(Step, Index->getType()->getScalarType(), |
3304 | GetInsertPoint()))); |
3305 | } |
3306 | case InductionDescriptor::IK_FpInduction: { |
3307 | assert(!isa<VectorType>(Index->getType()) &&
3308 | "Vector indices not supported for FP inductions yet");
3309 | assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3310 | auto InductionBinOp = ID.getInductionBinOp(); |
3311 | assert(InductionBinOp &&
3312 | (InductionBinOp->getOpcode() == Instruction::FAdd ||
3313 | InductionBinOp->getOpcode() == Instruction::FSub) &&
3314 | "Original bin op should be defined for FP induction");
3315 | |
3316 | Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); |
3317 | Value *MulExp = B.CreateFMul(StepValue, Index); |
3318 | return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, |
3319 | "induction"); |
3320 | } |
3321 | case InductionDescriptor::IK_NoInduction: |
3322 | return nullptr; |
3323 | } |
3324 | llvm_unreachable("invalid enum");
3325 | } |
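
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The three
// induction kinds above evaluate, in the scalar case, to the closed forms
// modeled here with hypothetical names: integer IVs as Start + Index * Step,
// pointer IVs as a GEP by Index * Step elements (shown on double for
// concreteness), and FP IVs as Start fadd/fsub Index * Step.
#include <cstdint>

int64_t intInductionAt(int64_t Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;
}

double *ptrInductionAt(double *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step; // C pointer arithmetic models the GEP.
}

double fpInductionAt(double Start, double Index, double Step, bool IsFAdd) {
  double Mul = Step * Index;
  return IsFAdd ? Start + Mul : Start - Mul;
}
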
3326 | |
3327 | Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { |
3328 | LoopScalarBody = OrigLoop->getHeader(); |
3329 | LoopVectorPreHeader = OrigLoop->getLoopPreheader(); |
3330 | assert(LoopVectorPreHeader && "Invalid loop structure");
3331 | LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr |
3332 | assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3333 | "multiple exit loop without required epilogue?");
3334 | |
3335 | LoopMiddleBlock = |
3336 | SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, |
3337 | LI, nullptr, Twine(Prefix) + "middle.block"); |
3338 | LoopScalarPreHeader = |
3339 | SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, |
3340 | nullptr, Twine(Prefix) + "scalar.ph"); |
3341 | |
3342 | auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); |
3343 | |
3344 | // Set up the middle block terminator. Two cases: |
3345 | // 1) If we know that we must execute the scalar epilogue, emit an |
3346 | // unconditional branch. |
3347 | // 2) Otherwise, we must have a single unique exit block (due to how we |
3348 | //    implement the multiple exit case). In this case, set up a conditional
3349 | // branch from the middle block to the loop scalar preheader, and the |
3350 | // exit block. completeLoopSkeleton will update the condition to use an |
3351 | // iteration check, if required to decide whether to execute the remainder. |
3352 | BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? |
3353 | BranchInst::Create(LoopScalarPreHeader) : |
3354 | BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, |
3355 | Builder.getTrue()); |
3356 | BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); |
3357 | ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); |
3358 | |
3359 | // We intentionally don't let SplitBlock update LoopInfo since
3360 | // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3361 | // LoopVectorBody is explicitly added to the correct place a few lines later.
3362 | LoopVectorBody = |
3363 | SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, |
3364 | nullptr, nullptr, Twine(Prefix) + "vector.body"); |
3365 | |
3366 | // Update dominator for loop exit. |
3367 | if (!Cost->requiresScalarEpilogue(VF)) |
3368 | // If there is an epilogue which must run, there's no edge from the |
3369 | // middle block to exit blocks and thus no need to update the immediate |
3370 | // dominator of the exit blocks. |
3371 | DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); |
3372 | |
3373 | // Create and register the new vector loop. |
3374 | Loop *Lp = LI->AllocateLoop(); |
3375 | Loop *ParentLoop = OrigLoop->getParentLoop(); |
3376 | |
3377 | // Insert the new loop into the loop nest and register the new basic blocks |
3378 | // before calling any utilities such as SCEV that require valid LoopInfo. |
3379 | if (ParentLoop) { |
3380 | ParentLoop->addChildLoop(Lp); |
3381 | } else { |
3382 | LI->addTopLevelLoop(Lp); |
3383 | } |
3384 | Lp->addBasicBlockToLoop(LoopVectorBody, *LI); |
3385 | return Lp; |
3386 | } |
3387 | |
3388 | void InnerLoopVectorizer::createInductionResumeValues( |
3389 | Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { |
3390 | assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3391 | (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3392 | "Inconsistent information about additional bypass.");
3393 | |
3394 | Value *VectorTripCount = getOrCreateVectorTripCount(L); |
3395 | assert(VectorTripCount && L && "Expected valid arguments");
3396 | // We are going to resume the execution of the scalar loop. |
3397 | // Go over all of the induction variables that we found and fix the |
3398 | // PHIs that are left in the scalar version of the loop. |
3399 | // The starting values of PHI nodes depend on the counter of the last |
3400 | // iteration in the vectorized loop. |
3401 | // If we come from a bypass edge then we need to start from the original |
3402 | // start value. |
3403 | Instruction *OldInduction = Legal->getPrimaryInduction(); |
3404 | for (auto &InductionEntry : Legal->getInductionVars()) { |
3405 | PHINode *OrigPhi = InductionEntry.first; |
3406 | InductionDescriptor II = InductionEntry.second; |
3407 | |
3408 | // Create phi nodes to merge from the backedge-taken check block. |
3409 | PHINode *BCResumeVal = |
3410 | PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", |
3411 | LoopScalarPreHeader->getTerminator()); |
3412 | // Copy original phi DL over to the new one. |
3413 | BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); |
3414 | Value *&EndValue = IVEndValues[OrigPhi]; |
3415 | Value *EndValueFromAdditionalBypass = AdditionalBypass.second; |
3416 | if (OrigPhi == OldInduction) { |
3417 | // We know what the end value is. |
3418 | EndValue = VectorTripCount; |
3419 | } else { |
3420 | IRBuilder<> B(L->getLoopPreheader()->getTerminator()); |
3421 | |
3422 | // Fast-math-flags propagate from the original induction instruction. |
3423 | if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) |
3424 | B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); |
3425 | |
3426 | Type *StepType = II.getStep()->getType(); |
3427 | Instruction::CastOps CastOp = |
3428 | CastInst::getCastOpcode(VectorTripCount, true, StepType, true); |
3429 | Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); |
3430 | const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); |
3431 | EndValue = |
3432 | emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); |
3433 | EndValue->setName("ind.end"); |
3434 | |
3435 | // Compute the end value for the additional bypass (if applicable). |
3436 | if (AdditionalBypass.first) { |
3437 | B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); |
3438 | CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, |
3439 | StepType, true); |
3440 | CRD = |
3441 | B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); |
3442 | EndValueFromAdditionalBypass = |
3443 | emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); |
3444 | EndValueFromAdditionalBypass->setName("ind.end"); |
3445 | } |
3446 | } |
3447 | // The new PHI merges the original incoming value, in case of a bypass, |
3448 | // or the value at the end of the vectorized loop. |
3449 | BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); |
3450 | |
3451 | // Fix the scalar body counter (PHI node). |
3452 | // The old induction's phi node in the scalar body needs the truncated |
3453 | // value. |
3454 | for (BasicBlock *BB : LoopBypassBlocks) |
3455 | BCResumeVal->addIncoming(II.getStartValue(), BB); |
3456 | |
3457 | if (AdditionalBypass.first) |
3458 | BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, |
3459 | EndValueFromAdditionalBypass); |
3460 | |
3461 | OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); |
3462 | } |
3463 | } |
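
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] For a
// non-primary integer induction, the resume value computed above is the
// induction evaluated at the vector trip count: the point where the scalar
// epilogue must pick the IV up. Hypothetical names:
#include <cstdint>

int64_t modelResumeValue(int64_t Start, int64_t Step, uint64_t VectorTripCount) {
  // ind.end, fed into the bc.resume.val phi on the middle-block edge.
  return Start + static_cast<int64_t>(VectorTripCount) * Step;
}
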
3464 | |
3465 | BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, |
3466 | MDNode *OrigLoopID) { |
3467 | assert(L && "Expected valid loop.");
3468 | |
3469 | // The trip counts should be cached by now. |
3470 | Value *Count = getOrCreateTripCount(L); |
3471 | Value *VectorTripCount = getOrCreateVectorTripCount(L); |
3472 | |
3473 | auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); |
3474 | |
3475 | // Add a check in the middle block to see if we have completed |
3476 | // all of the iterations in the first vector loop. Three cases: |
3477 | // 1) If we require a scalar epilogue, there is no conditional branch as |
3478 | // we unconditionally branch to the scalar preheader. Do nothing. |
3479 | // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. |
3480 | // Thus if tail is to be folded, we know we don't need to run the |
3481 | // remainder and we can use the previous value for the condition (true). |
3482 | // 3) Otherwise, construct a runtime check. |
3483 | if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { |
3484 | Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, |
3485 | Count, VectorTripCount, "cmp.n", |
3486 | LoopMiddleBlock->getTerminator()); |
3487 | |
3488 | // Here we use the same DebugLoc as the scalar loop latch terminator instead |
3489 | // of the corresponding compare because they may have ended up with |
3490 | // different line numbers and we want to avoid awkward line stepping while |
3491 | // debugging, e.g., if the compare has a line number inside the loop.
3492 | CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); |
3493 | cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); |
3494 | } |
3495 | |
3496 | // Get ready to start creating new instructions into the vectorized body. |
3497 | assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3498 | "Inconsistent vector loop preheader");
3499 | Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); |
3500 | |
3501 | #ifdef EXPENSIVE_CHECKS |
3502 | assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3503 | LI->verify(*DT); |
3504 | #endif |
3505 | |
3506 | return LoopVectorPreHeader; |
3507 | } |
3508 | |
3509 | std::pair<BasicBlock *, Value *> |
3510 | InnerLoopVectorizer::createVectorizedLoopSkeleton() { |
3511 | /* |
3512 | In this function we generate a new loop. The new loop will contain |
3513 | the vectorized instructions while the old loop will continue to run the |
3514 | scalar remainder. |
3515 | |
3516 | [ ] <-- loop iteration number check. |
3517 | / | |
3518 | / v |
3519 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
3520 | | / | |
3521 | | / v |
3522 | || [ ] <-- vector pre header. |
3523 | |/ | |
3524 | | v |
3525 | | [ ] \ |
3526 | | [ ]_| <-- vector loop. |
3527 | | | |
3528 | | v |
3529 | \ -[ ] <--- middle-block. |
3530 | \/ | |
3531 | /\ v |
3532 | | ->[ ] <--- new preheader. |
3533 | | | |
3534 | (opt) v <-- edge from middle to exit iff epilogue is not required. |
3535 | | [ ] \ |
3536 | | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). |
3537 | \ | |
3538 | \ v |
3539 | >[ ] <-- exit block(s). |
3540 | ... |
3541 | */ |
3542 | |
3543 | // Get the metadata of the original loop before it gets modified. |
3544 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
3545 | |
3546 | // Workaround! Compute the trip count of the original loop and cache it |
3547 | // before we start modifying the CFG. This code has a systemic problem |
3548 | // wherein it tries to run analysis over partially constructed IR; this is |
3549 | // wrong, and not simply for SCEV. The trip count of the original loop |
3550 | // simply happens to be prone to hitting this in practice. In theory, we |
3551 | // can hit the same issue for any SCEV, or ValueTracking query done during |
3552 | // mutation. See PR49900. |
3553 | getOrCreateTripCount(OrigLoop); |
3554 | |
3555 | // Create an empty vector loop, and prepare basic blocks for the runtime |
3556 | // checks. |
3557 | Loop *Lp = createVectorLoopSkeleton(""); |
3558 | |
3559 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
3560 | // jump to the scalar loop. This check also covers the case where the |
3561 | // backedge-taken count is uint##_max: adding one to it will overflow leading |
3562 | // to an incorrect trip count of zero. In this (rare) case we will also jump |
3563 | // to the scalar loop. |
3564 | emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); |
3565 | |
3566 | // Generate the code to check any assumptions that we've made for SCEV |
3567 | // expressions. |
3568 | emitSCEVChecks(Lp, LoopScalarPreHeader); |
3569 | |
3570 | // Generate the code that checks at runtime whether arrays overlap. We put
3571 | // the checks into a separate block to make the more common case of few
3572 | // elements faster.
3573 | emitMemRuntimeChecks(Lp, LoopScalarPreHeader); |
3574 | |
3575 | createHeaderBranch(Lp); |
3576 | |
3577 | // Emit phis for the new starting index of the scalar loop. |
3578 | createInductionResumeValues(Lp); |
3579 | |
3580 | return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; |
3581 | } |
3582 | |
3583 | // Fix up external users of the induction variable. At this point, we are |
3584 | // in LCSSA form, with all external PHIs that use the IV having one input value, |
3585 | // coming from the remainder loop. We need those PHIs to also have a correct |
3586 | // value for the IV when arriving directly from the middle block. |
3587 | void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, |
3588 | const InductionDescriptor &II, |
3589 | Value *CountRoundDown, Value *EndValue, |
3590 | BasicBlock *MiddleBlock) { |
3591 | // There are two kinds of external IV usages - those that use the value |
3592 | // computed in the last iteration (the PHI) and those that use the penultimate |
3593 | // value (the value that feeds into the phi from the loop latch). |
3594 | // We allow both, but they, obviously, have different values. |
3595 | |
3596 | assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3597 | |
3598 | DenseMap<Value *, Value *> MissingVals; |
3599 | |
3600 | // An external user of the last iteration's value should see the value that |
3601 | // the remainder loop uses to initialize its own IV. |
3602 | Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); |
3603 | for (User *U : PostInc->users()) { |
3604 | Instruction *UI = cast<Instruction>(U); |
3605 | if (!OrigLoop->contains(UI)) { |
3606 | assert(isa<PHINode>(UI) && "Expected LCSSA form");
3607 | MissingVals[UI] = EndValue; |
3608 | } |
3609 | } |
3610 | |
3611 | // An external user of the penultimate value needs to see EndValue - Step.
3612 | // The simplest way to get this is to recompute it from the constituent SCEVs, |
3613 | // that is Start + (Step * (CRD - 1)). |
3614 | for (User *U : OrigPhi->users()) { |
3615 | auto *UI = cast<Instruction>(U); |
3616 | if (!OrigLoop->contains(UI)) { |
3617 | const DataLayout &DL = |
3618 | OrigLoop->getHeader()->getModule()->getDataLayout(); |
3619 | assert(isa<PHINode>(UI) && "Expected LCSSA form");
3620 | |
3621 | IRBuilder<> B(MiddleBlock->getTerminator()); |
3622 | |
3623 | // Fast-math-flags propagate from the original induction instruction. |
3624 | if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) |
3625 | B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); |
3626 | |
3627 | Value *CountMinusOne = B.CreateSub( |
3628 | CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); |
3629 | Value *CMO = |
3630 | !II.getStep()->getType()->isIntegerTy() |
3631 | ? B.CreateCast(Instruction::SIToFP, CountMinusOne, |
3632 | II.getStep()->getType()) |
3633 | : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); |
3634 | CMO->setName("cast.cmo"); |
3635 | Value *Escape = |
3636 | emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); |
3637 | Escape->setName("ind.escape"); |
3638 | MissingVals[UI] = Escape; |
3639 | } |
3640 | } |
3641 | |
3642 | for (auto &I : MissingVals) { |
3643 | PHINode *PHI = cast<PHINode>(I.first); |
3644 | // One corner case we have to handle is two IVs "chasing" each-other, |
3645 | // that is %IV2 = phi [...], [ %IV1, %latch ] |
3646 | // In this case, if IV1 has an external use, we need to avoid adding both |
3647 | // "last value of IV1" and "penultimate value of IV2". So, verify that we |
3648 | // don't already have an incoming value for the middle block. |
3649 | if (PHI->getBasicBlockIndex(MiddleBlock) == -1) |
3650 | PHI->addIncoming(I.second, MiddleBlock); |
3651 | } |
3652 | } |
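
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The two
// escape values above, in closed form with hypothetical names. CRD is the
// vector trip count ("count rounded down").
#include <cstdint>

int64_t modelLastValue(int64_t Start, int64_t Step, uint64_t CRD) {
  return Start + static_cast<int64_t>(CRD) * Step;     // post-increment users
}

int64_t modelPenultimateValue(int64_t Start, int64_t Step, uint64_t CRD) {
  return Start + static_cast<int64_t>(CRD - 1) * Step; // ind.escape
}
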
3653 | |
3654 | namespace { |
3655 | |
3656 | struct CSEDenseMapInfo { |
3657 | static bool canHandle(const Instruction *I) { |
3658 | return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || |
3659 | isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); |
3660 | } |
3661 | |
3662 | static inline Instruction *getEmptyKey() { |
3663 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
3664 | } |
3665 | |
3666 | static inline Instruction *getTombstoneKey() { |
3667 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
3668 | } |
3669 | |
3670 | static unsigned getHashValue(const Instruction *I) { |
3671 | assert(canHandle(I) && "Unknown instruction!");
3672 | return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), |
3673 | I->value_op_end())); |
3674 | } |
3675 | |
3676 | static bool isEqual(const Instruction *LHS, const Instruction *RHS) { |
3677 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
3678 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
3679 | return LHS == RHS; |
3680 | return LHS->isIdenticalTo(RHS); |
3681 | } |
3682 | }; |
3683 | |
3684 | } // end anonymous namespace |
3685 | |
3686 | /// Perform CSE of induction variable instructions.
3687 | static void cse(BasicBlock *BB) { |
3688 | // Perform simple cse. |
3689 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
3690 | for (Instruction &In : llvm::make_early_inc_range(*BB)) { |
3691 | if (!CSEDenseMapInfo::canHandle(&In)) |
3692 | continue; |
3693 | |
3694 | // Check if we can replace this instruction with any of the |
3695 | // visited instructions. |
3696 | if (Instruction *V = CSEMap.lookup(&In)) { |
3697 | In.replaceAllUsesWith(V); |
3698 | In.eraseFromParent(); |
3699 | continue; |
3700 | } |
3701 | |
3702 | CSEMap[&In] = &In; |
3703 | } |
3704 | } |
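
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The CSE pass
// above keys each instruction by opcode plus operand list and replaces later
// duplicates with the first occurrence. A standalone model where each
// "instruction" is pre-reduced to such a key string (hypothetical names):
#include <string>
#include <unordered_map>
#include <vector>

std::vector<int> modelCSE(const std::vector<std::string> &InstKeys) {
  std::unordered_map<std::string, int> Seen; // key -> index of first def
  std::vector<int> ReplacedBy(InstKeys.size());
  for (int I = 0, E = static_cast<int>(InstKeys.size()); I != E; ++I)
    // First occurrence maps to itself; duplicates map to the earlier def,
    // mirroring replaceAllUsesWith + eraseFromParent above.
    ReplacedBy[I] = Seen.try_emplace(InstKeys[I], I).first->second;
  return ReplacedBy;
}
// modelCSE({"gep %a,%i", "gep %a,%i"}) == {0, 0}: the second GEP is replaced.
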
3705 | |
3706 | InstructionCost |
3707 | LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, |
3708 | bool &NeedToScalarize) const { |
3709 | Function *F = CI->getCalledFunction(); |
3710 | Type *ScalarRetTy = CI->getType(); |
3711 | SmallVector<Type *, 4> Tys, ScalarTys; |
3712 | for (auto &ArgOp : CI->args()) |
3713 | ScalarTys.push_back(ArgOp->getType()); |
3714 | |
3715 | // Estimate cost of scalarized vector call. The source operands are assumed |
3716 | // to be vectors, so we need to extract individual elements from them,
3717 | // execute VF scalar calls, and then gather the result into the vector return |
3718 | // value. |
3719 | InstructionCost ScalarCallCost = |
3720 | TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); |
3721 | if (VF.isScalar()) |
3722 | return ScalarCallCost; |
3723 | |
3724 | // Compute corresponding vector type for return value and arguments. |
3725 | Type *RetTy = ToVectorTy(ScalarRetTy, VF); |
3726 | for (Type *ScalarTy : ScalarTys) |
3727 | Tys.push_back(ToVectorTy(ScalarTy, VF)); |
3728 | |
3729 | // Compute costs of unpacking argument values for the scalar calls and |
3730 | // packing the return values to a vector. |
3731 | InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); |
3732 | |
3733 | InstructionCost Cost = |
3734 | ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; |
3735 | |
3736 | // If we can't emit a vector call for this function, then the currently found |
3737 | // cost is the cost we need to return. |
3738 | NeedToScalarize = true; |
3739 | VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); |
3740 | Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); |
3741 | |
3742 | if (!TLI || CI->isNoBuiltin() || !VecFunc) |
3743 | return Cost; |
3744 | |
3745 | // If the corresponding vector cost is cheaper, return its cost. |
3746 | InstructionCost VectorCallCost = |
3747 | TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); |
3748 | if (VectorCallCost < Cost) { |
3749 | NeedToScalarize = false; |
3750 | Cost = VectorCallCost; |
3751 | } |
3752 | return Cost; |
3753 | } |
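
// [Editor's illustrative sketch; not part of LoopVectorize.cpp.] The cost
// decision above in abstract units, with hypothetical names: default to the
// scalarized estimate, then prefer a vector library call when one exists and
// is cheaper.
#include <cstdint>

uint64_t modelVectorCallCost(uint64_t ScalarCallCost, uint64_t VF,
                             uint64_t ScalarizationCost,
                             bool HasVectorVariant, uint64_t VectorCallCost,
                             bool &NeedToScalarize) {
  // Extract VF argument sets, make VF scalar calls, repack the results.
  uint64_t Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}
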
3754 | |
3755 | static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { |
3756 | if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) |
3757 | return Elt; |
3758 | return VectorType::get(Elt, VF); |
3759 | } |
3760 | |
3761 | InstructionCost |
3762 | LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, |
3763 | ElementCount VF) const { |
3764 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
3765 | assert(ID && "Expected intrinsic call!");
3766 | Type *RetTy = MaybeVectorizeType(CI->getType(), VF); |
3767 | FastMathFlags FMF; |
3768 | if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) |
3769 | FMF = FPMO->getFastMathFlags(); |
3770 | |
3771 | SmallVector<const Value *> Arguments(CI->args()); |
3772 | FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); |
3773 | SmallVector<Type *> ParamTys; |
3774 | std::transform(FTy->param_begin(), FTy->param_end(), |
3775 | std::back_inserter(ParamTys), |
3776 | [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); |
3777 | |
3778 | IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, |
3779 | dyn_cast<IntrinsicInst>(CI)); |
3780 | return TTI.getIntrinsicInstrCost(CostAttrs, |
3781 | TargetTransformInfo::TCK_RecipThroughput); |
3782 | } |
3783 | |
3784 | static Type *smallestIntegerVectorType(Type *T1, Type *T2) { |
3785 | auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); |
3786 | auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); |
3787 | return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; |
3788 | } |
3789 | |
3790 | static Type *largestIntegerVectorType(Type *T1, Type *T2) { |
3791 | auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); |
3792 | auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); |
3793 | return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; |
3794 | } |
3795 | |
3796 | void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { |
3797 | // For every instruction `I` in MinBWs, truncate the operands, create a |
3798 | // truncated version of `I` and re-extend its result. InstCombine runs
3799 | // later and will remove any ext/trunc pairs. |
3800 | SmallPtrSet<Value *, 4> Erased; |
3801 | for (const auto &KV : Cost->getMinimalBitwidths()) { |
3802 | // If the value wasn't vectorized, we must maintain the original scalar |
3803 | // type. The absence of the value from State indicates that it |
3804 | // wasn't vectorized. |
3805 | // FIXME: Should not rely on getVPValue at this point. |
3806 | VPValue *Def = State.Plan->getVPValue(KV.first, true); |
3807 | if (!State.hasAnyVectorValue(Def)) |
3808 | continue; |
3809 | for (unsigned Part = 0; Part < UF; ++Part) { |
3810 | Value *I = State.get(Def, Part); |
3811 | if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) |
3812 | continue; |
3813 | Type *OriginalTy = I->getType(); |
3814 | Type *ScalarTruncatedTy = |
3815 | IntegerType::get(OriginalTy->getContext(), KV.second); |
3816 | auto *TruncatedTy = VectorType::get( |
3817 | ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); |
3818 | if (TruncatedTy == OriginalTy) |
3819 | continue; |
3820 | |
3821 | IRBuilder<> B(cast<Instruction>(I)); |
3822 | auto ShrinkOperand = [&](Value *V) -> Value * { |
3823 | if (auto *ZI = dyn_cast<ZExtInst>(V)) |
3824 | if (ZI->getSrcTy() == TruncatedTy) |
3825 | return ZI->getOperand(0); |
3826 | return B.CreateZExtOrTrunc(V, TruncatedTy); |
3827 | }; |
3828 | |
3829 | // The actual instruction modification depends on the instruction type, |
3830 | // unfortunately. |
3831 | Value *NewI = nullptr; |
3832 | if (auto *BO = dyn_cast<BinaryOperator>(I)) { |
3833 | NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), |
3834 | ShrinkOperand(BO->getOperand(1))); |
3835 | |
3836 | // Any wrapping introduced by shrinking this operation shouldn't be |
3837 | // considered undefined behavior. So, we can't unconditionally copy |
3838 | // arithmetic wrapping flags to NewI. |
3839 | cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); |
3840 | } else if (auto *CI = dyn_cast<ICmpInst>(I)) { |
3841 | NewI = |
3842 | B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), |
3843 | ShrinkOperand(CI->getOperand(1))); |
3844 | } else if (auto *SI = dyn_cast<SelectInst>(I)) { |
3845 | NewI = B.CreateSelect(SI->getCondition(), |
3846 | ShrinkOperand(SI->getTrueValue()), |
3847 | ShrinkOperand(SI->getFalseValue())); |
3848 | } else if (auto *CI = dyn_cast<CastInst>(I)) { |
3849 | switch (CI->getOpcode()) { |
3850 | default: |
3851 | llvm_unreachable("Unhandled cast!");
3852 | case Instruction::Trunc: |
3853 | NewI = ShrinkOperand(CI->getOperand(0)); |
3854 | break; |
3855 | case Instruction::SExt: |
3856 | NewI = B.CreateSExtOrTrunc( |
3857 | CI->getOperand(0), |
3858 | smallestIntegerVectorType(OriginalTy, TruncatedTy)); |
3859 | break; |
3860 | case Instruction::ZExt: |
3861 | NewI = B.CreateZExtOrTrunc( |
3862 | CI->getOperand(0), |
3863 | smallestIntegerVectorType(OriginalTy, TruncatedTy)); |
3864 | break; |
3865 | } |
3866 | } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { |
3867 | auto Elements0 = |
3868 | cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); |
3869 | auto *O0 = B.CreateZExtOrTrunc( |
3870 | SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); |
3871 | auto Elements1 = |
3872 | cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); |
3873 | auto *O1 = B.CreateZExtOrTrunc( |
3874 | SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); |
3875 | |
3876 | NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); |
3877 | } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { |
3878 | // Don't do anything with the operands, just extend the result. |
3879 | continue; |
3880 | } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { |
3881 | auto Elements = |
3882 | cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); |
3883 | auto *O0 = B.CreateZExtOrTrunc( |
3884 | IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); |
3885 | auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); |
3886 | NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); |
3887 | } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { |
3888 | auto Elements = |
3889 | cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); |
3890 | auto *O0 = B.CreateZExtOrTrunc( |
3891 | EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); |
3892 | NewI = B.CreateExtractElement(O0, EE->getOperand(2)); |
3893 | } else { |
3894 | // If we don't know what to do, be conservative and don't do anything. |
3895 | continue; |
3896 | } |
3897 | |
3898 | // Lastly, extend the result. |
3899 | NewI->takeName(cast<Instruction>(I)); |
3900 | Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); |
3901 | I->replaceAllUsesWith(Res); |
3902 | cast<Instruction>(I)->eraseFromParent(); |
3903 | Erased.insert(I); |
3904 | State.reset(Def, Res, Part); |
3905 | } |
3906 | } |
3907 | |
3908 | // We'll have created a bunch of ZExts that are now parentless. Clean up. |
3909 | for (const auto &KV : Cost->getMinimalBitwidths()) { |
3910 | // If the value wasn't vectorized, we must maintain the original scalar |
3911 | // type. The absence of the value from State indicates that it |
3912 | // wasn't vectorized. |
3913 | // FIXME: Should not rely on getVPValue at this point. |
3914 | VPValue *Def = State.Plan->getVPValue(KV.first, true); |
3915 | if (!State.hasAnyVectorValue(Def)) |
3916 | continue; |
3917 | for (unsigned Part = 0; Part < UF; ++Part) { |
3918 | Value *I = State.get(Def, Part); |
3919 | ZExtInst *Inst = dyn_cast<ZExtInst>(I); |
3920 | if (Inst && Inst->use_empty()) { |
3921 | Value *NewI = Inst->getOperand(0); |
3922 | Inst->eraseFromParent(); |
3923 | State.reset(Def, NewI, Part); |
3924 | } |
3925 | } |
3926 | } |
3927 | } |
3928 | |
3929 | void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { |
3930 | // Insert truncates and extends for any truncated instructions as hints to |
3931 | // InstCombine. |
3932 | if (VF.isVector()) |
3933 | truncateToMinimalBitwidths(State); |
3934 | |
3935 | // Fix widened non-induction PHIs by setting up the PHI operands. |
3936 | if (OrigPHIsToFix.size()) { |
3937 | assert(EnableVPlanNativePath &&
3938 | "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3939 | fixNonInductionPHIs(State); |
3940 | } |
3941 | |
3942 | // At this point every instruction in the original loop is widened to a |
3943 | // vector form. Now we need to fix the recurrences in the loop. These PHI |
3944 | // nodes are currently empty because we did not want to introduce cycles. |
3945 | // This is the second stage of vectorizing recurrences. |
3946 | fixCrossIterationPHIs(State); |
3947 | |
3948 | // Forget the original basic block. |
3949 | PSE.getSE()->forgetLoop(OrigLoop); |
3950 | |
3951 | // If we inserted an edge from the middle block to the unique exit block, |
3952 | // update uses outside the loop (phis) to account for the newly inserted |
3953 | // edge. |
3954 | if (!Cost->requiresScalarEpilogue(VF)) { |
3955 | // Fix-up external users of the induction variables. |
3956 | for (auto &Entry : Legal->getInductionVars()) |
3957 | fixupIVUsers(Entry.first, Entry.second, |
3958 | getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), |
3959 | IVEndValues[Entry.first], LoopMiddleBlock); |
3960 | |
3961 | fixLCSSAPHIs(State); |
3962 | } |
3963 | |
3964 | for (Instruction *PI : PredicatedInstructions) |
3965 | sinkScalarOperands(&*PI); |
3966 | |
3967 | // Remove redundant induction instructions. |
3968 | cse(LoopVectorBody); |
3969 | |
3970 | // Set/update profile weights for the vector and remainder loops as original |
3971 | // loop iterations are now distributed among them. Note that original loop |
3972 | // represented by LoopScalarBody becomes remainder loop after vectorization. |
3973 | // |
3974 | // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3975 | // end up with a slightly roughened result, but that should be OK since the
3976 | // profile is not inherently precise anyway. Note also that a possible bypass
3977 | // of vector code caused by legality checks is ignored, optimistically
3978 | // assigning all the weight to the vector loop.
3979 | // |
3980 | // For scalable vectorization we can't know at compile time how many
3981 | // iterations of the loop are handled in one vector iteration, so instead
3982 | // assume a pessimistic vscale of '1'.
3983 | setProfileInfoAfterUnrolling( |
3984 | LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), |
3985 | LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); |
3986 | } |
3987 | |
3988 | void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { |
3989 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
3990 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
3991 | // stage #2: We now need to fix the recurrences by adding incoming edges to |
3992 | // the currently empty PHI nodes. At this point every instruction in the |
3993 | // original loop is widened to a vector form so we can use them to construct |
3994 | // the incoming edges. |
3995 | VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); |
3996 | for (VPRecipeBase &R : Header->phis()) { |
3997 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) |
3998 | fixReduction(ReductionPhi, State); |
3999 | else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) |
4000 | fixFirstOrderRecurrence(FOR, State); |
4001 | } |
4002 | } |
4003 | |
4004 | void InnerLoopVectorizer::fixFirstOrderRecurrence( |
4005 | VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { |
4006 | // This is the second phase of vectorizing first-order recurrences. An |
4007 | // overview of the transformation is described below. Suppose we have the |
4008 | // following loop. |
4009 | // |
4010 | // for (int i = 0; i < n; ++i) |
4011 | // b[i] = a[i] - a[i - 1]; |
4012 | // |
4013 | // There is a first-order recurrence on "a". For this loop, the shorthand |
4014 | // scalar IR looks like: |
4015 | // |
4016 | // scalar.ph: |
4017 | // s_init = a[-1] |
4018 | // br scalar.body |
4019 | // |
4020 | // scalar.body: |
4021 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
4022 | // s1 = phi [s_init, scalar.ph], [s2, scalar.body] |
4023 | // s2 = a[i] |
4024 | // b[i] = s2 - s1 |
4025 | // br cond, scalar.body, ... |
4026 | // |
4027 | // In this example, s1 is a recurrence because its value depends on the
4028 | // previous iteration. In the first phase of vectorization, we created a |
4029 | // vector phi v1 for s1. We now complete the vectorization and produce the |
4030 | // shorthand vector IR shown below (for VF = 4, UF = 1). |
4031 | // |
4032 | // vector.ph: |
4033 | // v_init = vector(..., ..., ..., a[-1]) |
4034 | // br vector.body |
4035 | // |
4036 | // vector.body |
4037 | // i = phi [0, vector.ph], [i+4, vector.body] |
4038 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
4039 | // v2 = a[i, i+1, i+2, i+3]; |
4040 | // v3 = vector(v1(3), v2(0, 1, 2)) |
4041 | // b[i, i+1, i+2, i+3] = v2 - v3 |
4042 | // br cond, vector.body, middle.block |
4043 | // |
4044 | // middle.block: |
4045 | // x = v2(3) |
4046 | // br scalar.ph |
4047 | // |
4048 | // scalar.ph: |
4049 | // s_init = phi [x, middle.block], [a[-1], otherwise] |
4050 | // br scalar.body |
4051 | // |
4052 | // After the vector loop completes execution, we extract the next value of
4053 | // the recurrence (x) to use as the initial value in the scalar loop. |
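     | //
     | // Concretely (hypothetical values), in the first vector iteration
     | // v1 = <*, *, *, a[-1]>, v2 = <a[0], a[1], a[2], a[3]>, and the splice
     | // v3 = <a[-1], a[0], a[1], a[2]>, so "v2 - v3" computes a[i] - a[i-1]
     | // lane by lane.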
4054 | |
4055 | // Extract the last vector element in the middle block. This will be the |
4056 | // initial value for the recurrence when jumping to the scalar loop. |
4057 | VPValue *PreviousDef = PhiR->getBackedgeValue(); |
4058 | Value *Incoming = State.get(PreviousDef, UF - 1); |
4059 | auto *ExtractForScalar = Incoming; |
4060 | auto *IdxTy = Builder.getInt32Ty(); |
4061 | if (VF.isVector()) { |
4062 | auto *One = ConstantInt::get(IdxTy, 1); |
4063 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
4064 | auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); |
4065 | auto *LastIdx = Builder.CreateSub(RuntimeVF, One); |
4066 | ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, |
4067 | "vector.recur.extract"); |
4068 | } |
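     | // E.g., for a fixed VF = 4 the extract reads lane 3; for a scalable
     | // <vscale x 4 x ...> vector, RuntimeVF is (4 * vscale) and the extract
     | // reads lane (4 * vscale - 1), which is only known at run time.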
4069 | // Extract the second-to-last element in the middle block if the
4070 | // Phi is used outside the loop. We need to extract the phi itself |
4071 | // and not the last element (the phi update in the current iteration). This |
4072 | // will be the value when jumping to the exit block from the LoopMiddleBlock, |
4073 | // when the scalar loop is not run at all. |
4074 | Value *ExtractForPhiUsedOutsideLoop = nullptr; |
4075 | if (VF.isVector()) { |
4076 | auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); |
4077 | auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); |
4078 | ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( |
4079 | Incoming, Idx, "vector.recur.extract.for.phi"); |
4080 | } else if (UF > 1) |
4081 | // When the loop is unrolled without vectorizing, initialize
4082 | // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to the
4083 | // final value of `Incoming`. This is analogous to the vectorized case
4084 | // above: extracting the second-to-last element when VF > 1.
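     | // E.g., with VF = 1 and UF = 3, the last scalar value of the recurrence
     | // lives in part 2, so the value feeding users outside the loop is taken
     | // from part 1 (UF - 2).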
4085 | ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); |
4086 | |
4087 | // Fix the initial value of the original recurrence in the scalar loop. |
4088 | Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); |
4089 | PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); |
4090 | auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); |
4091 | auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); |
4092 | for (auto *BB : predecessors(LoopScalarPreHeader)) { |
4093 | auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; |
4094 | Start->addIncoming(Incoming, BB); |
4095 | } |
4096 | |
4097 | Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); |
4098 | Phi->setName("scalar.recur"); |
4099 | |
4100 | // Finally, fix users of the recurrence outside the loop. The users will need |
4101 | // either the last value of the scalar recurrence or the last value of the |
4102 | // vector recurrence we extracted in the middle block. Since the loop is in |
4103 | // LCSSA form, we just need to find all the phi nodes for the original scalar |
4104 | // recurrence in the exit block, and then add an edge for the middle block. |
4105 | // Note that LCSSA does not imply single entry when the original scalar loop |
4106 | // had multiple exiting edges (as we always run the last iteration in the |
4107 | // scalar epilogue); in that case, there is no edge from middle to exit,
4108 | // and thus no phis need to be updated.
4109 | if (!Cost->requiresScalarEpilogue(VF)) |
4110 | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) |
4111 | if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) |
4112 | LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); |
4113 | } |
4114 | |
4115 | void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, |
4116 | VPTransformState &State) { |
4117 | PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); |
4118 | // Get its reduction variable descriptor.
4119 | assert(Legal->isReductionVariable(OrigPhi) &&
4120 | "Unable to find the reduction variable");
4121 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
4122 | |
4123 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
4124 | TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); |
4125 | Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); |
4126 | setDebugLocFromInst(ReductionStartValue); |
4127 | |
4128 | VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); |
4129 | // This is the vector-clone of the value that leaves the loop. |
4130 | Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); |
4131 | |
4132 | // Wrap flags are in general invalid after vectorization, clear them. |
4133 | clearReductionWrapFlags(RdxDesc, State); |
4134 | |
4135 | // Before each round, move the insertion point right between |
4136 | // the PHIs and the values we are going to write. |
4137 | // This allows us to write both PHINodes and the extractelement |
4138 | // instructions. |
4139 | Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); |
4140 | |
4141 | setDebugLocFromInst(LoopExitInst); |
4142 | |
4143 | Type *PhiTy = OrigPhi->getType(); |
4144 | // If the tail is folded by masking, the vector value to leave the loop should
4145 | // be a Select choosing between the vectorized LoopExitInst and the vectorized
4146 | // Phi, rather than the LoopExitInst itself. For an inloop reduction the
4147 | // reduction will already be predicated, and does not need to be handled here.
4148 | if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { |
4149 | for (unsigned Part = 0; Part < UF; ++Part) { |
4150 | Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); |
4151 | Value *Sel = nullptr; |
4152 | for (User *U : VecLoopExitInst->users()) { |
4153 | if (isa<SelectInst>(U)) { |
4154 | assert(!Sel && "Reduction exit feeding two selects");
4155 | Sel = U; |
4156 | } else |
4157 | assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4158 | } |
4159 | assert(Sel && "Reduction exit feeds no select");
4160 | State.reset(LoopExitInstDef, Sel, Part); |
4161 | |
4162 | // If the target can create a predicated operator for the reduction at no |
4163 | // extra cost in the loop (for example a predicated vadd), it can be |
4164 | // cheaper for the select to remain in the loop than be sunk out of it, |
4165 | // and so use the select value for the phi instead of the old |
4166 | // LoopExitValue. |
4167 | if (PreferPredicatedReductionSelect || |
4168 | TTI->preferPredicatedReductionSelect( |
4169 | RdxDesc.getOpcode(), PhiTy, |
4170 | TargetTransformInfo::ReductionFlags())) { |
4171 | auto *VecRdxPhi = |
4172 | cast<PHINode>(State.get(PhiR, Part)); |
4173 | VecRdxPhi->setIncomingValueForBlock( |
4174 | LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); |
4175 | } |
4176 | } |
4177 | } |
4178 | |
4179 | // If the vector reduction can be performed in a smaller type, we truncate |
4180 | // then extend the loop exit value to enable InstCombine to evaluate the |
4181 | // entire expression in the smaller type. |
4182 | if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { |
4183 | assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4184 | Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); |
4185 | Builder.SetInsertPoint( |
4186 | LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); |
4187 | VectorParts RdxParts(UF); |
4188 | for (unsigned Part = 0; Part < UF; ++Part) { |
4189 | RdxParts[Part] = State.get(LoopExitInstDef, Part); |
4190 | Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); |
4191 | Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) |
4192 | : Builder.CreateZExt(Trunc, VecTy); |
4193 | for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) |
4194 | if (U != Trunc) { |
4195 | U->replaceUsesOfWith(RdxParts[Part], Extnd); |
4196 | RdxParts[Part] = Extnd; |
4197 | } |
4198 | } |
4199 | Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); |
4200 | for (unsigned Part = 0; Part < UF; ++Part) { |
4201 | RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); |
4202 | State.reset(LoopExitInstDef, RdxParts[Part], Part); |
4203 | } |
4204 | } |
4205 | |
4206 | // Reduce all of the unrolled parts into a single vector. |
4207 | Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); |
4208 | unsigned Op = RecurrenceDescriptor::getOpcode(RK); |
4209 | |
4210 | // The middle block terminator has already been assigned a DebugLoc here (the |
4211 | // OrigLoop's single latch terminator). We want the whole middle block to |
4212 | // appear to execute on this line because: (a) it is all compiler generated, |
4213 | // (b) these instructions are always executed after evaluating the latch |
4214 | // conditional branch, and (c) other passes may add new predecessors which |
4215 | // terminate on this line. This is the easiest way to ensure we don't |
4216 | // accidentally cause an extra step back into the loop while debugging. |
4217 | setDebugLocFromInst(LoopMiddleBlock->getTerminator()); |
4218 | if (PhiR->isOrdered()) |
4219 | ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); |
4220 | else { |
4221 | // Floating-point operations should have some FMF to enable the reduction. |
4222 | IRBuilderBase::FastMathFlagGuard FMFG(Builder); |
4223 | Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); |
4224 | for (unsigned Part = 1; Part < UF; ++Part) { |
4225 | Value *RdxPart = State.get(LoopExitInstDef, Part); |
4226 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) { |
4227 | ReducedPartRdx = Builder.CreateBinOp( |
4228 | (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); |
4229 | } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) |
4230 | ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, |
4231 | ReducedPartRdx, RdxPart); |
4232 | else |
4233 | ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); |
4234 | } |
4235 | } |
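     | // E.g., for UF = 2 and an integer add reduction, this emits a single
     | // "bin.rdx" = add(part1, part0); the horizontal reduction of that vector
     | // into a scalar happens below via createTargetReduction.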
4236 | |
4237 | // Create the reduction after the loop. Note that inloop reductions create the |
4238 | // target reduction in the loop using a Reduction recipe. |
4239 | if (VF.isVector() && !PhiR->isInLoop()) { |
4240 | ReducedPartRdx = |
4241 | createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); |
4242 | // If the reduction can be performed in a smaller type, we need to extend |
4243 | // the reduction to the wider type before we branch to the original loop. |
4244 | if (PhiTy != RdxDesc.getRecurrenceType()) |
4245 | ReducedPartRdx = RdxDesc.isSigned() |
4246 | ? Builder.CreateSExt(ReducedPartRdx, PhiTy) |
4247 | : Builder.CreateZExt(ReducedPartRdx, PhiTy); |
4248 | } |
4249 | |
4250 | PHINode *ResumePhi = |
4251 | dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); |
4252 | |
4253 | // Create a phi node that merges control-flow from the backedge-taken check |
4254 | // block and the middle block. |
4255 | PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", |
4256 | LoopScalarPreHeader->getTerminator()); |
4257 | |
4258 | // If we are fixing reductions in the epilogue loop then we should already |
4259 | // have created a bc.merge.rdx Phi after the main vector body. Ensure that |
4260 | // we carry over the incoming values correctly. |
4261 | for (auto *Incoming : predecessors(LoopScalarPreHeader)) { |
4262 | if (Incoming == LoopMiddleBlock) |
4263 | BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); |
4264 | else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) |
4265 | BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), |
4266 | Incoming); |
4267 | else |
4268 | BCBlockPhi->addIncoming(ReductionStartValue, Incoming); |
4269 | } |
4270 | |
4271 | // Set the resume value for this reduction |
4272 | ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); |
4273 | |
4274 | // Now, we need to fix the users of the reduction variable |
4275 | // inside and outside of the scalar remainder loop. |
4276 | |
4277 | // We know that the loop is in LCSSA form. We need to update the PHI nodes |
4278 | // in the exit blocks. See comment on analogous loop in |
4279 | // fixFirstOrderRecurrence for a more complete explanation of the logic.
4280 | if (!Cost->requiresScalarEpilogue(VF)) |
4281 | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) |
4282 | if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) |
4283 | LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); |
4284 | |
4285 | // Fix the scalar loop reduction variable with the incoming reduction sum |
4286 | // from the vector body and from the backedge value. |
4287 | int IncomingEdgeBlockIdx = |
4288 | OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); |
4289 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4290 | // Pick the other block. |
4291 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
4292 | OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); |
4293 | OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); |
4294 | } |
4295 | |
4296 | void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, |
4297 | VPTransformState &State) { |
4298 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
4299 | if (RK != RecurKind::Add && RK != RecurKind::Mul) |
4300 | return; |
4301 | |
4302 | Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); |
4303 | assert(LoopExitInstr && "null loop exit instruction");
4304 | SmallVector<Instruction *, 8> Worklist; |
4305 | SmallPtrSet<Instruction *, 8> Visited; |
4306 | Worklist.push_back(LoopExitInstr); |
4307 | Visited.insert(LoopExitInstr); |
4308 | |
4309 | while (!Worklist.empty()) { |
4310 | Instruction *Cur = Worklist.pop_back_val(); |
4311 | if (isa<OverflowingBinaryOperator>(Cur)) |
4312 | for (unsigned Part = 0; Part < UF; ++Part) { |
4313 | // FIXME: Should not rely on getVPValue at this point. |
4314 | Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); |
4315 | cast<Instruction>(V)->dropPoisonGeneratingFlags(); |
4316 | } |
4317 | |
4318 | for (User *U : Cur->users()) { |
4319 | Instruction *UI = cast<Instruction>(U); |
4320 | if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && |
4321 | Visited.insert(UI).second) |
4322 | Worklist.push_back(UI); |
4323 | } |
4324 | } |
4325 | } |
4326 | |
4327 | void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { |
4328 | for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { |
4329 | if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) |
4330 | // Some phis were already hand updated by the reduction and recurrence |
4331 | // code above, leave them alone. |
4332 | continue; |
4333 | |
4334 | auto *IncomingValue = LCSSAPhi.getIncomingValue(0); |
4335 | // Non-instruction incoming values will have only one value. |
4336 | |
4337 | VPLane Lane = VPLane::getFirstLane(); |
4338 | if (isa<Instruction>(IncomingValue) && |
4339 | !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), |
4340 | VF)) |
4341 | Lane = VPLane::getLastLaneForVF(VF); |
4342 | |
4343 | // Can be a loop invariant incoming value or the last scalar value to be |
4344 | // extracted from the vectorized loop. |
4345 | // FIXME: Should not rely on getVPValue at this point. |
4346 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
4347 | Value *lastIncomingValue = |
4348 | OrigLoop->isLoopInvariant(IncomingValue) |
4349 | ? IncomingValue |
4350 | : State.get(State.Plan->getVPValue(IncomingValue, true), |
4351 | VPIteration(UF - 1, Lane)); |
4352 | LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); |
4353 | } |
4354 | } |
4355 | |
4356 | void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { |
4357 | // The basic block and loop containing the predicated instruction. |
4358 | auto *PredBB = PredInst->getParent(); |
4359 | auto *VectorLoop = LI->getLoopFor(PredBB); |
4360 | |
4361 | // Initialize a worklist with the operands of the predicated instruction. |
4362 | SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); |
4363 | |
4364 | // Holds instructions that we need to analyze again. An instruction may be |
4365 | // reanalyzed if we don't yet know if we can sink it or not. |
4366 | SmallVector<Instruction *, 8> InstsToReanalyze; |
4367 | |
4368 | // Returns true if a given use occurs in the predicated block. Phi nodes use |
4369 | // their operands in their corresponding predecessor blocks. |
4370 | auto isBlockOfUsePredicated = [&](Use &U) -> bool { |
4371 | auto *I = cast<Instruction>(U.getUser()); |
4372 | BasicBlock *BB = I->getParent(); |
4373 | if (auto *Phi = dyn_cast<PHINode>(I)) |
4374 | BB = Phi->getIncomingBlock( |
4375 | PHINode::getIncomingValueNumForOperand(U.getOperandNo())); |
4376 | return BB == PredBB; |
4377 | }; |
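     | // E.g., a use by "%p = phi [%v, %pred.block], ..." in a successor block
     | // counts as occurring in %pred.block (the incoming block), so it does not
     | // prevent sinking the definition of %v into the predicated block.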
4378 | |
4379 | // Iteratively sink the scalarized operands of the predicated instruction |
4380 | // into the block we created for it. When an instruction is sunk, its
4381 | // operands are then added to the worklist. The algorithm ends when a pass
4382 | // through the worklist doesn't sink a single instruction.
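     | //
     | // E.g., with t1 = add(x, y) and t2 = mul(t1, z) where t2's only uses are
     | // in PredBB, sinking t2 first makes all of t1's uses predicated, which in
     | // turn allows t1 to be sunk as well.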
4383 | bool Changed; |
4384 | do { |
4385 | // Add the instructions that need to be reanalyzed to the worklist, and |
4386 | // reset the changed indicator. |
4387 | Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); |
4388 | InstsToReanalyze.clear(); |
4389 | Changed = false; |
4390 | |
4391 | while (!Worklist.empty()) { |
4392 | auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); |
4393 | |
4394 | // We can't sink an instruction if it is a phi node, is not in the loop, |
4395 | // or may have side effects. |
4396 | if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || |
4397 | I->mayHaveSideEffects()) |
4398 | continue; |
4399 | |
4400 | // If the instruction is already in PredBB, check if we can sink its |
4401 | // operands. In that case, VPlan's sinkScalarOperands() succeeded in |
4402 | // sinking the scalar instruction I, hence it appears in PredBB; but it |
4403 | // may have failed to sink I's operands (recursively), which we try |
4404 | // (again) here. |
4405 | if (I->getParent() == PredBB) { |
4406 | Worklist.insert(I->op_begin(), I->op_end()); |
4407 | continue; |
4408 | } |
4409 | |
4410 | // It's legal to sink the instruction if all its uses occur in the |
4411 | // predicated block. Otherwise, there's nothing to do yet, and we may |
4412 | // need to reanalyze the instruction. |
4413 | if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { |
4414 | InstsToReanalyze.push_back(I); |
4415 | continue; |
4416 | } |
4417 | |
4418 | // Move the instruction to the beginning of the predicated block, and add |
4419 | // its operands to the worklist.
4420 | I->moveBefore(&*PredBB->getFirstInsertionPt()); |
4421 | Worklist.insert(I->op_begin(), I->op_end()); |
4422 | |
4423 | // The sinking may have enabled other instructions to be sunk, so we will |
4424 | // need to iterate. |
4425 | Changed = true; |
4426 | } |
4427 | } while (Changed); |
4428 | } |
4429 | |
4430 | void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { |
4431 | for (PHINode *OrigPhi : OrigPHIsToFix) { |
4432 | VPWidenPHIRecipe *VPPhi = |
4433 | cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); |
4434 | PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); |
4435 | // Make sure the builder has a valid insert point. |
4436 | Builder.SetInsertPoint(NewPhi); |
4437 | for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { |
4438 | VPValue *Inc = VPPhi->getIncomingValue(i); |
4439 | VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); |
4440 | NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); |
4441 | } |
4442 | } |
4443 | } |
4444 | |
4445 | bool InnerLoopVectorizer::useOrderedReductions( |
4446 | const RecurrenceDescriptor &RdxDesc) { |
4447 | return Cost->useOrderedReductions(RdxDesc); |
4448 | } |
4449 | |
4450 | void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, |
4451 | VPWidenPHIRecipe *PhiR, |
4452 | VPTransformState &State) { |
4453 | PHINode *P = cast<PHINode>(PN); |
4454 | if (EnableVPlanNativePath) { |
4455 | // Currently we enter here in the VPlan-native path for non-induction |
4456 | // PHIs where all control flow is uniform. We simply widen these PHIs. |
4457 | // Create a vector phi with no operands - the vector phi operands will be |
4458 | // set at the end of vector code generation. |
4459 | Type *VecTy = (State.VF.isScalar()) |
4460 | ? PN->getType() |
4461 | : VectorType::get(PN->getType(), State.VF); |
4462 | Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); |
4463 | State.set(PhiR, VecPhi, 0); |
4464 | OrigPHIsToFix.push_back(P); |
4465 | |
4466 | return; |
4467 | } |
4468 | |
4469 | assert(PN->getParent() == OrigLoop->getHeader() &&
4470 | "Non-header phis should have been handled elsewhere");
4471 | |
4472 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
4473 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
4474 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
4475 | // this value when we vectorize all of the instructions that use the PHI. |
4476 | |
4477 | assert(!Legal->isReductionVariable(P) &&
4478 | "reductions should be handled elsewhere");
4479 | |
4480 | setDebugLocFromInst(P); |
4481 | |
4482 | // This PHINode must be an induction variable. |
4483 | // Make sure that we know about it. |
4484 | assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4485 | |
4486 | InductionDescriptor II = Legal->getInductionVars().lookup(P); |
4487 | const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); |
4488 | |
4489 | auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); |
4490 | PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); |
4491 | |
4492 | // FIXME: The newly created binary instructions should contain nsw/nuw flags, |
4493 | // which can be found from the original scalar operations. |
4494 | switch (II.getKind()) { |
4495 | case InductionDescriptor::IK_NoInduction: |
4496 | llvm_unreachable("Unknown induction");
4497 | case InductionDescriptor::IK_IntInduction: |
4498 | case InductionDescriptor::IK_FpInduction: |
4499 | llvm_unreachable("Integer/fp induction is handled elsewhere.");
4500 | case InductionDescriptor::IK_PtrInduction: { |
4501 | // Handle the pointer induction variable case. |
4502 | assert(P->getType()->isPointerTy() && "Unexpected type.");
4503 | |
4504 | if (Cost->isScalarAfterVectorization(P, State.VF)) { |
4505 | // This is the normalized GEP that starts counting at zero. |
4506 | Value *PtrInd = |
4507 | Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); |
4508 | // Determine the number of scalars we need to generate for each unroll |
4509 | // iteration. If the instruction is uniform, we only need to generate the |
4510 | // first lane. Otherwise, we generate all VF values. |
4511 | bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); |
4512 | assert((IsUniform || !State.VF.isScalable()) &&
4513 | "Cannot scalarize a scalable VF");
4514 | unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); |
4515 | |
4516 | for (unsigned Part = 0; Part < UF; ++Part) { |
4517 | Value *PartStart = |
4518 | createStepForVF(Builder, PtrInd->getType(), VF, Part); |
4519 | |
4520 | for (unsigned Lane = 0; Lane < Lanes; ++Lane) { |
4521 | Value *Idx = Builder.CreateAdd( |
4522 | PartStart, ConstantInt::get(PtrInd->getType(), Lane)); |
4523 | Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); |
4524 | Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), |
4525 | DL, II, State.CFG.PrevBB); |
4526 | SclrGep->setName("next.gep"); |
4527 | State.set(PhiR, SclrGep, VPIteration(Part, Lane)); |
4528 | } |
4529 | } |
4530 | return; |
4531 | } |
4532 | assert(isa<SCEVConstant>(II.getStep()) &&
4533 | "Induction step not a SCEV constant!");
4534 | Type *PhiType = II.getStep()->getType(); |
4535 | |
4536 | // Build a pointer phi |
4537 | Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); |
4538 | Type *ScStValueType = ScalarStartValue->getType(); |
4539 | PHINode *NewPointerPhi = |
4540 | PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); |
4541 | NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); |
4542 | |
4543 | // A pointer induction, performed by using a gep |
4544 | BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); |
4545 | Instruction *InductionLoc = LoopLatch->getTerminator(); |
4546 | const SCEV *ScalarStep = II.getStep(); |
4547 | SCEVExpander Exp(*PSE.getSE(), DL, "induction"); |
4548 | Value *ScalarStepValue = |
4549 | Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); |
4550 | Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); |
4551 | Value *NumUnrolledElems = |
4552 | Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); |
4553 | Value *InductionGEP = GetElementPtrInst::Create( |
4554 | II.getElementType(), NewPointerPhi, |
4555 | Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", |
4556 | InductionLoc); |
4557 | NewPointerPhi->addIncoming(InductionGEP, LoopLatch); |
4558 | |
4559 | // Create UF many actual address geps that use the pointer |
4560 | // phi as base and a vectorized version of the step value |
4561 | // (<step*0, ..., step*N>) as offset. |
4562 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
4563 | Type *VecPhiType = VectorType::get(PhiType, State.VF); |
4564 | Value *StartOffsetScalar = |
4565 | Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); |
4566 | Value *StartOffset = |
4567 | Builder.CreateVectorSplat(State.VF, StartOffsetScalar); |
4568 | // Create a vector of consecutive numbers from zero to VF. |
4569 | StartOffset = |
4570 | Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); |
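     | // E.g., for a fixed VF = 4 and Part = 1, StartOffset is now <4, 5, 6, 7>;
     | // scaled by the step below, each part addresses its own chunk of
     | // consecutive pointer values.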
4571 | |
4572 | Value *GEP = Builder.CreateGEP( |
4573 | II.getElementType(), NewPointerPhi, |
4574 | Builder.CreateMul( |
4575 | StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), |
4576 | "vector.gep")); |
4577 | State.set(PhiR, GEP, Part); |
4578 | } |
4579 | } |
4580 | } |
4581 | } |
4582 | |
4583 | /// A helper function for checking whether an integer division-related |
4584 | /// instruction may divide by zero (in which case it must be predicated if |
4585 | /// executed conditionally in the scalar code). |
4586 | /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). |
4587 | /// Non-zero divisors that are not compile-time constants will not be
4588 | /// converted into multiplication, so we will still end up scalarizing |
4589 | /// the division, but can do so w/o predication. |
4590 | static bool mayDivideByZero(Instruction &I) { |
4591 | assert((I.getOpcode() == Instruction::UDiv ||
4592 | I.getOpcode() == Instruction::SDiv ||
4593 | I.getOpcode() == Instruction::URem ||
4594 | I.getOpcode() == Instruction::SRem) &&
4595 | "Unexpected instruction");
4596 | Value *Divisor = I.getOperand(1); |
4597 | auto *CInt = dyn_cast<ConstantInt>(Divisor); |
4598 | return !CInt || CInt->isZero(); |
4599 | } |
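     | // E.g., "udiv i32 %x, 7" returns false (a known non-zero constant
     | // divisor), while "udiv i32 %x, %y" and "udiv i32 %x, 0" both return true.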
4600 | |
4601 | void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, |
4602 | VPUser &ArgOperands, |
4603 | VPTransformState &State) { |
4604 | assert(!isa<DbgInfoIntrinsic>(I) &&
4605 | "DbgInfoIntrinsic should have been dropped during VPlan construction");
4606 | setDebugLocFromInst(&I); |
4607 | |
4608 | Module *M = I.getParent()->getParent()->getParent(); |
4609 | auto *CI = cast<CallInst>(&I); |
4610 | |
4611 | SmallVector<Type *, 4> Tys; |
4612 | for (Value *ArgOperand : CI->args()) |
4613 | Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); |
4614 | |
4615 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
4616 | |
4617 | // The flag shows whether we use an Intrinsic or a usual Call for the
4618 | // vectorized version of the instruction, i.e. whether it is more
4619 | // beneficial to perform an intrinsic call than a library call.
4620 | bool NeedToScalarize = false; |
4621 | InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); |
4622 | InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; |
4623 | bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; |
4624 | assert((UseVectorIntrinsic || !NeedToScalarize) &&
4625 | "Instruction should be scalarized elsewhere.");
4626 | assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4627 | "Either the intrinsic cost or vector call cost must be valid");
4628 | |
4629 | for (unsigned Part = 0; Part < UF; ++Part) { |
4630 | SmallVector<Type *, 2> TysForDecl = {CI->getType()}; |
4631 | SmallVector<Value *, 4> Args; |
4632 | for (auto &I : enumerate(ArgOperands.operands())) { |
4633 | // Some intrinsics have a scalar argument - don't replace it with a |
4634 | // vector. |
4635 | Value *Arg; |
4636 | if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) |
4637 | Arg = State.get(I.value(), Part); |
4638 | else { |
4639 | Arg = State.get(I.value(), VPIteration(0, 0)); |
4640 | if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) |
4641 | TysForDecl.push_back(Arg->getType()); |
4642 | } |
4643 | Args.push_back(Arg); |
4644 | } |
4645 | |
4646 | Function *VectorF; |
4647 | if (UseVectorIntrinsic) { |
4648 | // Use vector version of the intrinsic. |
4649 | if (VF.isVector()) |
4650 | TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); |
4651 | VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); |
4652 | assert(VectorF && "Can't retrieve vector intrinsic.");
4653 | } else { |
4654 | // Use vector version of the function call. |
4655 | const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); |
4656 | #ifndef NDEBUG |
4657 | assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4658 | "Can't create vector function.");
4659 | #endif |
4660 | VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); |
4661 | } |
4662 | SmallVector<OperandBundleDef, 1> OpBundles; |
4663 | CI->getOperandBundlesAsDefs(OpBundles); |
4664 | CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); |
4665 | |
4666 | if (isa<FPMathOperator>(V)) |
4667 | V->copyFastMathFlags(CI); |
4668 | |
4669 | State.set(Def, V, Part); |
4670 | addMetadata(V, &I); |
4671 | } |
4672 | } |
4673 | |
4674 | void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { |
4675 | // We should not collect Scalars more than once per VF. Right now, this |
4676 | // function is called from collectUniformsAndScalars(), which already does |
4677 | // this check. Collecting Scalars for VF=1 does not make any sense. |
4678 | assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4679 | "This function should not be visited twice for the same VF");
4680 | |
4681 | SmallSetVector<Instruction *, 8> Worklist; |
4682 | |
4683 | // These sets are used to seed the analysis with pointers used by memory |
4684 | // accesses that will remain scalar. |
4685 | SmallSetVector<Instruction *, 8> ScalarPtrs; |
4686 | SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; |
4687 | auto *Latch = TheLoop->getLoopLatch(); |
4688 | |
4689 | // A helper that returns true if the use of Ptr by MemAccess will be scalar. |
4690 | // The pointer operands of loads and stores will be scalar as long as the |
4691 | // memory access is not a gather or scatter operation. The value operand of a |
4692 | // store will remain scalar if the store is scalarized. |
4693 | auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { |
4694 | InstWidening WideningDecision = getWideningDecision(MemAccess, VF); |
4695 | assert(WideningDecision != CM_Unknown &&
4696 | "Widening decision should be ready at this moment");
4697 | if (auto *Store = dyn_cast<StoreInst>(MemAccess)) |
4698 | if (Ptr == Store->getValueOperand()) |
4699 | return WideningDecision == CM_Scalarize; |
4700 | assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4701 | "Ptr is neither a value or pointer operand");
4702 | return WideningDecision != CM_GatherScatter; |
4703 | }; |
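     | // E.g., the pointer of a load widened to a consecutive vector load stays a
     | // scalar use, whereas a CM_GatherScatter decision means the pointer itself
     | // is vectorized into a vector of pointers.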
4704 | |
4705 | // A helper that returns true if the given value is a bitcast or |
4706 | // getelementptr instruction contained in the loop. |
4707 | auto isLoopVaryingBitCastOrGEP = [&](Value *V) { |
4708 | return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || |
4709 | isa<GetElementPtrInst>(V)) && |
4710 | !TheLoop->isLoopInvariant(V); |
4711 | }; |
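     | // E.g., a getelementptr computed inside the loop from the induction
     | // variable qualifies; a GEP defined outside the loop is loop-invariant and
     | // does not.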
4712 | |
4713 | // A helper that evaluates a memory access's use of a pointer. If the use will |
4714 | // be a scalar use and the pointer is only used by memory accesses, we place |
4715 | // the pointer in ScalarPtrs. Otherwise, the pointer is placed in |
4716 | // PossibleNonScalarPtrs. |
4717 | auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { |
4718 | // We only care about bitcast and getelementptr instructions contained in |
4719 | // the loop. |
4720 | if (!isLoopVaryingBitCastOrGEP(Ptr)) |
4721 | return; |
4722 | |
4723 | // If the pointer has already been identified as scalar (e.g., if it was |
4724 | // also identified as uniform), there's nothing to do. |
4725 | auto *I = cast<Instruction>(Ptr); |
4726 | if (Worklist.count(I)) |
4727 | return; |
4728 | |
4729 | // If the use of the pointer will be a scalar use, and all users of the |
4730 | // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, |
4731 | // place the pointer in PossibleNonScalarPtrs. |
4732 | if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { |
4733 | return isa<LoadInst>(U) || isa<StoreInst>(U); |
4734 | })) |
4735 | ScalarPtrs.insert(I); |
4736 | else |
4737 | PossibleNonScalarPtrs.insert(I); |
4738 | }; |
4739 | |
4740 | // We seed the scalars analysis with two classes of instructions: (1)
4741 | // instructions marked uniform-after-vectorization and (2) bitcast, |
4742 | // getelementptr and (pointer) phi instructions used by memory accesses |
4743 | // requiring a scalar use. |
4744 | // |
4745 | // (1) Add to the worklist all instructions that have been identified as |
4746 | // uniform-after-vectorization. |
4747 | Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); |
4748 | |
4749 | // (2) Add to the worklist all bitcast and getelementptr instructions used by |
4750 | // memory accesses requiring a scalar use. The pointer operands of loads and |
4751 | // stores will be scalar as long as the memory access is not a gather or
4752 | // scatter operation. The value operand of a store will remain scalar if the |
4753 | // store is scalarized. |
4754 | for (auto *BB : TheLoop->blocks()) |
4755 | for (auto &I : *BB) { |
4756 | if (auto *Load = dyn_cast<LoadInst>(&I)) { |
4757 | evaluatePtrUse(Load, Load->getPointerOperand()); |
4758 | } else if (auto *Store = dyn_cast<StoreInst>(&I)) { |
4759 | evaluatePtrUse(Store, Store->getPointerOperand()); |
4760 | evaluatePtrUse(Store, Store->getValueOperand()); |
4761 | } |
4762 | } |
4763 | for (auto *I : ScalarPtrs) |
4764 | if (!PossibleNonScalarPtrs.count(I)) { |
4765 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4766 | Worklist.insert(I); |
4767 | } |
4768 | |
4769 | // Insert the forced scalars. |
4770 | // FIXME: Currently widenPHIInstruction() often creates a dead vector |
4771 | // induction variable when the PHI user is scalarized. |
4772 | auto ForcedScalar = ForcedScalars.find(VF); |
4773 | if (ForcedScalar != ForcedScalars.end()) |
4774 | for (auto *I : ForcedScalar->second) |
4775 | Worklist.insert(I); |
4776 | |
4777 | // Expand the worklist by looking through any bitcasts and getelementptr |
4778 | // instructions we've already identified as scalar. This is similar to the |
4779 | // expansion step in collectLoopUniforms(); however, here we're only |
4780 | // expanding to include additional bitcasts and getelementptr instructions. |
4781 | unsigned Idx = 0; |
4782 | while (Idx != Worklist.size()) { |
4783 | Instruction *Dst = Worklist[Idx++]; |
4784 | if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) |
4785 | continue; |
4786 | auto *Src = cast<Instruction>(Dst->getOperand(0)); |
4787 | if (llvm::all_of(Src->users(), [&](User *U) -> bool { |
4788 | auto *J = cast<Instruction>(U); |
4789 | return !TheLoop->contains(J) || Worklist.count(J) || |
4790 | ((isa<LoadInst>(J) || isa<StoreInst>(J)) && |
4791 | isScalarUse(J, Src)); |
4792 | })) { |
4793 | Worklist.insert(Src); |
4794 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4795 | } |
4796 | } |
4797 | |
4798 | // An induction variable will remain scalar if all users of the induction |
4799 | // variable and induction variable update remain scalar. |
4800 | for (auto &Induction : Legal->getInductionVars()) { |
4801 | auto *Ind = Induction.first; |
4802 | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
4803 | |
4804 | // If tail-folding is applied, the primary induction variable will be used |
4805 | // to feed a vector compare. |
4806 | if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) |
4807 | continue; |
4808 | |
4809 | // Returns true if \p Indvar is a pointer induction that is used directly by |
4810 | // load/store instruction \p I. |
4811 | auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, |
4812 | Instruction *I) { |
4813 | return Induction.second.getKind() == |
4814 | InductionDescriptor::IK_PtrInduction && |
4815 | (isa<LoadInst>(I) || isa<StoreInst>(I)) && |
4816 | Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); |
4817 | }; |
4818 | |
4819 | // Determine if all users of the induction variable are scalar after |
4820 | // vectorization. |
4821 | auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { |
4822 | auto *I = cast<Instruction>(U); |
4823 | return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || |
4824 | IsDirectLoadStoreFromPtrIndvar(Ind, I); |
4825 | }); |
4826 | if (!ScalarInd) |
4827 | continue; |
4828 | |
4829 | // Determine if all users of the induction variable update instruction are |
4830 | // scalar after vectorization. |
4831 | auto ScalarIndUpdate = |
4832 | llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { |
4833 | auto *I = cast<Instruction>(U); |
4834 | return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || |
4835 | IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); |
4836 | }); |
4837 | if (!ScalarIndUpdate) |
4838 | continue; |
4839 | |
4840 | // The induction variable and its update instruction will remain scalar. |
4841 | Worklist.insert(Ind); |
4842 | Worklist.insert(IndUpdate); |
4843 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4844 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4845 | << "\n");
4846 | } |
4847 | |
4848 | Scalars[VF].insert(Worklist.begin(), Worklist.end()); |
4849 | } |
4850 | |
4851 | bool LoopVectorizationCostModel::isScalarWithPredication( |
4852 | Instruction *I, ElementCount VF) const { |
4853 | if (!blockNeedsPredicationForAnyReason(I->getParent())) |
4854 | return false; |
4855 | switch (I->getOpcode()) {
4856 | default: |
4857 | break; |
4858 | case Instruction::Load: |
4859 | case Instruction::Store: { |
4860 | if (!Legal->isMaskRequired(I)) |
4861 | return false; |
4862 | auto *Ptr = getLoadStorePointerOperand(I); |
4863 | auto *Ty = getLoadStoreType(I); |
4864 | Type *VTy = Ty; |
4865 | if (VF.isVector()) |
4866 | VTy = VectorType::get(Ty, VF); |
4867 | const Align Alignment = getLoadStoreAlignment(I); |
4868 | return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || |
4869 | TTI.isLegalMaskedGather(VTy, Alignment)) |
4870 | : !(isLegalMaskedStore(Ty, Ptr, Alignment) || |
4871 | TTI.isLegalMaskedScatter(VTy, Alignment)); |
4872 | } |
4873 | case Instruction::UDiv: |
4874 | case Instruction::SDiv: |
4875 | case Instruction::SRem: |
4876 | case Instruction::URem: |
4877 | return mayDivideByZero(*I); |
4878 | } |
4879 | return false; |
4880 | } |
4881 | |
4882 | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( |
4883 | Instruction *I, ElementCount VF) { |
4884 | assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4885 | assert(getWideningDecision(I, VF) == CM_Unknown &&
4886 | "Decision should not be set yet.");
4887 | auto *Group = getInterleavedAccessGroup(I);
4888 | assert(Group && "Must have a group.");
4889 | |
4890 | // If the instruction's allocated size doesn't equal its type size, it
4891 | // requires padding and will be scalarized. |
4892 | auto &DL = I->getModule()->getDataLayout(); |
4893 | auto *ScalarTy = getLoadStoreType(I); |
4894 | if (hasIrregularType(ScalarTy, DL)) |
4895 | return false; |
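     | // For illustration (assuming a typical x86-64 data layout): x86_fp80 has
     | // a type size of 80 bits but an allocated size of 128 bits, so
     | // hasIrregularType() holds and the access is scalarized rather than
     | // widened as part of the group.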
4896 | |
4897 | // Check if masking is required. |
4898 | // A Group may need masking for one of two reasons: it resides in a block that |
4899 | // needs predication, or it was decided to use masking to deal with gaps |
4900 | // (either a gap at the end of a load-access that may result in a speculative |
4901 | // load, or any gaps in a store-access). |
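     | // For illustration (hypothetical group, not from the source): a store
     | // group writing A[3*i] and A[3*i+1] has two members but a factor of 3;
     | // the gap at A[3*i+2] makes StoreAccessWithGapsRequiresMasking true
     | // below.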
4902 | bool PredicatedAccessRequiresMasking = |
4903 | blockNeedsPredicationForAnyReason(I->getParent()) && |
4904 | Legal->isMaskRequired(I); |
4905 | bool LoadAccessWithGapsRequiresEpilogMasking = |
4906 | isa<LoadInst>(I) && Group->requiresScalarEpilogue() && |
4907 | !isScalarEpilogueAllowed(); |
4908 | bool StoreAccessWithGapsRequiresMasking = |
4909 | isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); |
4910 | if (!PredicatedAccessRequiresMasking && |
4911 | !LoadAccessWithGapsRequiresEpilogMasking && |
4912 | !StoreAccessWithGapsRequiresMasking) |
4913 | return true; |
4914 | |
4915 | // If masked interleaving is required, we expect that the user/target had |
4916 | // enabled it, because otherwise it either wouldn't have been created or |
4917 | // it should have been invalidated by the CostModel. |
4918 | assert(useMaskedInterleavedAccesses(TTI) &&
4919 | "Masked interleave-groups for predicated accesses are not enabled.");
4920 | |
4921 | if (Group->isReverse()) |
4922 | return false; |
4923 | |
4924 | auto *Ty = getLoadStoreType(I); |
4925 | const Align Alignment = getLoadStoreAlignment(I); |
4926 | return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) |
4927 | : TTI.isLegalMaskedStore(Ty, Alignment); |
4928 | } |
4929 | |
4930 | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( |
4931 | Instruction *I, ElementCount VF) { |
4932 | // Get and ensure we have a valid memory instruction. |
4933 | assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4934 | |
4935 | auto *Ptr = getLoadStorePointerOperand(I); |
4936 | auto *ScalarTy = getLoadStoreType(I); |
4937 | |
4938 | // In order to be widened, the pointer should be consecutive, first of all. |
4939 | if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) |
4940 | return false; |
4941 | |
4942 | // If the instruction is a store located in a predicated block, it will be |
4943 | // scalarized. |
4944 | if (isScalarWithPredication(I, VF)) |
4945 | return false; |
4946 | |
4947 | // If the instruction's allocated size doesn't equal its type size, it
4948 | // requires padding and will be scalarized. |
4949 | auto &DL = I->getModule()->getDataLayout(); |
4950 | if (hasIrregularType(ScalarTy, DL)) |
4951 | return false; |
4952 | |
4953 | return true; |
4954 | } |
4955 | |
4956 | void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { |
4957 | // We should not collect Uniforms more than once per VF. Right now, |
4958 | // this function is called from collectUniformsAndScalars(), which |
4959 | // already does this check. Collecting Uniforms for VF=1 does not make any |
4960 | // sense. |
4961 | |
4962 | assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4963 | "This function should not be visited twice for the same VF");
4964 | |
4965 | // Visit the list of Uniforms. Even if no uniform value is found, clearing
4966 | // the entry marks this VF as analyzed: Uniforms.count(VF) will return 1.
4967 | Uniforms[VF].clear(); |
4968 | |
4969 | // We now know that the loop is vectorizable! |
4970 | // Collect instructions inside the loop that will remain uniform after |
4971 | // vectorization. |
4972 | |
4973 | // Global values, params and instructions outside of current loop are out of |
4974 | // scope. |
4975 | auto isOutOfScope = [&](Value *V) -> bool { |
4976 | Instruction *I = dyn_cast<Instruction>(V); |
4977 | return (!I || !TheLoop->contains(I)); |
4978 | }; |
4979 | |
4980 | // Worklist containing uniform instructions demanding lane 0. |
4981 | SetVector<Instruction *> Worklist; |
4982 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
4983 | |
4984 | // Add uniform instructions demanding lane 0 to the worklist. Instructions |
4985 | // that are scalar with predication must not be considered uniform after |
4986 | // vectorization, because that would create an erroneous replicating region |
4987 | // where only a single instance out of VF should be formed. |
4988 | // TODO: optimize such seldom cases if found important, see PR40816. |
4989 | auto addToWorklistIfAllowed = [&](Instruction *I) -> void { |
4990 | if (isOutOfScope(I)) { |
4991 | LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4992 | << *I << "\n");
4993 | return; |
4994 | } |
4995 | if (isScalarWithPredication(I, VF)) { |
4996 | LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4997 | << *I << "\n");
4998 | return; |
4999 | } |
5000 | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5001 | Worklist.insert(I); |
5002 | }; |
5003 | |
5004 | // Start with the conditional branch. If the branch condition is an |
5005 | // instruction contained in the loop that is only used by the branch, it is |
5006 | // uniform. |
5007 | auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); |
5008 | if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) |
5009 | addToWorklistIfAllowed(Cmp); |
5010 | |
5011 | auto isUniformDecision = [&](Instruction *I, ElementCount VF) { |
5012 | InstWidening WideningDecision = getWideningDecision(I, VF); |
5013 | assert(WideningDecision != CM_Unknown &&
5014 | "Widening decision should be ready at this moment");
5015 | |
5016 | // A uniform memory op is itself uniform. We exclude uniform stores |
5017 | // here as they demand the last lane, not the first one. |
5018 | if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { |
5019 | assert(WideningDecision == CM_Scalarize);
5020 | return true; |
5021 | } |
5022 | |
5023 | return (WideningDecision == CM_Widen || |
5024 | WideningDecision == CM_Widen_Reverse || |
5025 | WideningDecision == CM_Interleave); |
5026 | }; |
5027 | |
5028 | |
5029 | // Returns true if Ptr is the pointer operand of a memory access instruction |
5030 | // I, and I is known to not require scalarization. |
5031 | auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { |
5032 | return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); |
5033 | }; |
5034 | |
5035 | // Holds a list of values which are known to have at least one uniform use. |
5036 | // Note that there may be other uses which aren't uniform. A "uniform use" |
5037 | // here is something which only demands lane 0 of the unrolled iterations; |
5038 | // it does not imply that all lanes produce the same value (e.g. this is not |
5039 | // the usual meaning of uniform).
5040 | SetVector<Value *> HasUniformUse; |
5041 | |
5042 | // Scan the loop for instructions which are either a) known to have only |
5043 | // lane 0 demanded or b) are uses which demand only lane 0 of their operand. |
5044 | for (auto *BB : TheLoop->blocks()) |
5045 | for (auto &I : *BB) { |
5046 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { |
5047 | switch (II->getIntrinsicID()) { |
5048 | case Intrinsic::sideeffect: |
5049 | case Intrinsic::experimental_noalias_scope_decl: |
5050 | case Intrinsic::assume: |
5051 | case Intrinsic::lifetime_start: |
5052 | case Intrinsic::lifetime_end: |
5053 | if (TheLoop->hasLoopInvariantOperands(&I)) |
5054 | addToWorklistIfAllowed(&I); |
5055 | break; |
5056 | default: |
5057 | break; |
5058 | } |
5059 | } |
5060 | |
5061 | // ExtractValue instructions must be uniform, because the operands are |
5062 | // known to be loop-invariant. |
5063 | if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { |
5064 | assert(isOutOfScope(EVI->getAggregateOperand()) &&
5065 | "Expected aggregate value to be loop invariant");
5066 | addToWorklistIfAllowed(EVI); |
5067 | continue; |
5068 | } |
5069 | |
5070 | // If there's no pointer operand, there's nothing to do. |
5071 | auto *Ptr = getLoadStorePointerOperand(&I); |
5072 | if (!Ptr) |
5073 | continue; |
5074 | |
5075 | // A uniform memory op is itself uniform. We exclude uniform stores |
5076 | // here as they demand the last lane, not the first one. |
5077 | if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) |
5078 | addToWorklistIfAllowed(&I); |
5079 | |
5080 | if (isUniformDecision(&I, VF)) { |
5081 | assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5082 | HasUniformUse.insert(Ptr); |
5083 | } |
5084 | } |
5085 | |
5086 | // Add to the worklist any operands which have *only* uniform (e.g. lane 0 |
5087 | // demanding) users. Since loops are assumed to be in LCSSA form, this |
5088 | // disallows uses outside the loop as well. |
5089 | for (auto *V : HasUniformUse) { |
5090 | if (isOutOfScope(V)) |
5091 | continue; |
5092 | auto *I = cast<Instruction>(V); |
5093 | auto UsersAreMemAccesses = |
5094 | llvm::all_of(I->users(), [&](User *U) -> bool { |
5095 | return isVectorizedMemAccessUse(cast<Instruction>(U), V); |
5096 | }); |
5097 | if (UsersAreMemAccesses) |
5098 | addToWorklistIfAllowed(I); |
5099 | } |
5100 | |
5101 | // Expand Worklist in topological order: whenever a new instruction
5102 | // is added, its users should already be inside Worklist. This ensures
5103 | // a uniform instruction will only be used by uniform instructions.
5104 | unsigned idx = 0; |
5105 | while (idx != Worklist.size()) { |
5106 | Instruction *I = Worklist[idx++]; |
5107 | |
5108 | for (auto OV : I->operand_values()) { |
5109 | // isOutOfScope operands cannot be uniform instructions. |
5110 | if (isOutOfScope(OV)) |
5111 | continue; |
5112 | // First-order recurrence phis should typically be considered
5113 | // non-uniform.
5114 | auto *OP = dyn_cast<PHINode>(OV); |
5115 | if (OP && Legal->isFirstOrderRecurrence(OP)) |
5116 | continue; |
5117 | // If all the users of the operand are uniform, then add the |
5118 | // operand into the uniform worklist. |
5119 | auto *OI = cast<Instruction>(OV); |
5120 | if (llvm::all_of(OI->users(), [&](User *U) -> bool { |
5121 | auto *J = cast<Instruction>(U); |
5122 | return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); |
5123 | })) |
5124 | addToWorklistIfAllowed(OI); |
5125 | } |
5126 | } |
5127 | |
5128 | // For an instruction to be added into Worklist above, all its users inside |
5129 | // the loop should also be in Worklist. However, this condition cannot be |
5130 | // true for phi nodes that form a cyclic dependence. We must process phi |
5131 | // nodes separately. An induction variable will remain uniform if all users |
5132 | // of the induction variable and induction variable update remain uniform. |
5133 | // The code below handles both pointer and non-pointer induction variables. |
5134 | for (auto &Induction : Legal->getInductionVars()) { |
5135 | auto *Ind = Induction.first; |
5136 | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
5137 | |
5138 | // Determine if all users of the induction variable are uniform after |
5139 | // vectorization. |
5140 | auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { |
5141 | auto *I = cast<Instruction>(U); |
5142 | return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || |
5143 | isVectorizedMemAccessUse(I, Ind); |
5144 | }); |
5145 | if (!UniformInd) |
5146 | continue; |
5147 | |
5148 | // Determine if all users of the induction variable update instruction are |
5149 | // uniform after vectorization. |
5150 | auto UniformIndUpdate = |
5151 | llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { |
5152 | auto *I = cast<Instruction>(U); |
5153 | return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || |
5154 | isVectorizedMemAccessUse(I, IndUpdate); |
5155 | }); |
5156 | if (!UniformIndUpdate) |
5157 | continue; |
5158 | |
5159 | // The induction variable and its update instruction will remain uniform. |
5160 | addToWorklistIfAllowed(Ind); |
5161 | addToWorklistIfAllowed(IndUpdate); |
5162 | } |
5163 | |
5164 | Uniforms[VF].insert(Worklist.begin(), Worklist.end()); |
5165 | } |
5166 | |
5167 | bool LoopVectorizationCostModel::runtimeChecksRequired() { |
5168 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5169 | |
5170 | if (Legal->getRuntimePointerChecking()->Need) { |
5171 | reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", |
5172 | "runtime pointer checks needed. Enable vectorization of this " |
5173 | "loop with '#pragma clang loop vectorize(enable)' when " |
5174 | "compiling with -Os/-Oz", |
5175 | "CantVersionLoopWithOptForSize", ORE, TheLoop); |
5176 | return true; |
5177 | } |
5178 | |
5179 | if (!PSE.getUnionPredicate().getPredicates().empty()) { |
5180 | reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", |
5181 | "runtime SCEV checks needed. Enable vectorization of this " |
5182 | "loop with '#pragma clang loop vectorize(enable)' when " |
5183 | "compiling with -Os/-Oz", |
5184 | "CantVersionLoopWithOptForSize", ORE, TheLoop); |
5185 | return true; |
5186 | } |
5187 | |
5188 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
5189 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
5190 | reportVectorizationFailure("Runtime stride check for small trip count", |
5191 | "runtime stride == 1 checks needed. Enable vectorization of " |
5192 | "this loop without such check by compiling with -Os/-Oz", |
5193 | "CantVersionLoopWithOptForSize", ORE, TheLoop); |
5194 | return true; |
5195 | } |
5196 | |
5197 | return false; |
5198 | } |
5199 | |
5200 | ElementCount |
5201 | LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
5202 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) |
5203 | return ElementCount::getScalable(0); |
5204 | |
5205 | if (Hints->isScalableVectorizationDisabled()) { |
5206 | reportVectorizationInfo("Scalable vectorization is explicitly disabled", |
5207 | "ScalableVectorizationDisabled", ORE, TheLoop); |
5208 | return ElementCount::getScalable(0); |
5209 | } |
5210 | |
5211 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5212 | |
5213 | auto MaxScalableVF = ElementCount::getScalable( |
5214 | std::numeric_limits<ElementCount::ScalarTy>::max()); |
5215 | |
5216 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
5217 | // FIXME: While for scalable vectors this is currently sufficient, this should |
5218 | // be replaced by a more detailed mechanism that filters out specific VFs, |
5219 | // instead of invalidating vectorization for a whole set of VFs based on the |
5220 | // MaxVF. |
5221 | |
5222 | // Disable scalable vectorization if the loop contains unsupported reductions. |
5223 | if (!canVectorizeReductions(MaxScalableVF)) { |
5224 | reportVectorizationInfo( |
5225 | "Scalable vectorization not supported for the reduction " |
5226 | "operations found in this loop.", |
5227 | "ScalableVFUnfeasible", ORE, TheLoop); |
5228 | return ElementCount::getScalable(0); |
5229 | } |
5230 | |
5231 | // Disable scalable vectorization if the loop contains any instructions |
5232 | // with element types not supported for scalable vectors. |
5233 | if (any_of(ElementTypesInLoop, [&](Type *Ty) { |
5234 | return !Ty->isVoidTy() && |
5235 | !this->TTI.isElementTypeLegalForScalableVector(Ty); |
5236 | })) { |
5237 | reportVectorizationInfo("Scalable vectorization is not supported " |
5238 | "for all element types found in this loop.", |
5239 | "ScalableVFUnfeasible", ORE, TheLoop); |
5240 | return ElementCount::getScalable(0); |
5241 | } |
5242 | |
5243 | if (Legal->isSafeForAnyVectorWidth()) |
5244 | return MaxScalableVF; |
5245 | |
5246 | // Limit MaxScalableVF by the maximum safe dependence distance. |
5247 | Optional<unsigned> MaxVScale = TTI.getMaxVScale(); |
5248 | if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) |
5249 | MaxVScale = |
5250 | TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); |
5251 | MaxScalableVF = ElementCount::getScalable( |
5252 | MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); |
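     | // For illustration (values assumed): with MaxSafeElements == 32 and a
     | // known maximum vscale of 4, the clamped bound is vscale x 8; with no
     | // known maximum vscale it degrades to vscale x 0 and the report below
     | // fires.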
5253 | if (!MaxScalableVF) |
5254 | reportVectorizationInfo( |
5255 | "Max legal vector width too small, scalable vectorization " |
5256 | "unfeasible.", |
5257 | "ScalableVFUnfeasible", ORE, TheLoop); |
5258 | |
5259 | return MaxScalableVF; |
5260 | } |
5261 | |
5262 | FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( |
5263 | unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { |
5264 | MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); |
5265 | unsigned SmallestType, WidestType; |
5266 | std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); |
5267 | |
5268 | // Get the maximum safe dependence distance in bits computed by LAA. |
5269 | // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from |
5270 | // the memory accesses that is most restrictive (involved in the smallest |
5271 | // dependence distance). |
5272 | unsigned MaxSafeElements = |
5273 | PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); |
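     | // For illustration (values assumed): a max safe dependence width of 256
     | // bits and a widest type of 64 bits give
     | //   MaxSafeElements = PowerOf2Floor(256 / 64) = 4,
     | // capping both the fixed and scalable candidates computed next.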
5274 | |
5275 | auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); |
5276 | auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); |
5277 | |
5278 | LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5279 | << ".\n");
5280 | LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5281 | << ".\n");
5282 | |
5283 | // First analyze the UserVF, fall back if the UserVF should be ignored. |
5284 | if (UserVF) { |
5285 | auto MaxSafeUserVF = |
5286 | UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; |
5287 | |
5288 | if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { |
5289 | // If `VF=vscale x N` is safe, then so is `VF=N` |
5290 | if (UserVF.isScalable()) |
5291 | return FixedScalableVFPair( |
5292 | ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); |
5293 | else |
5294 | return UserVF; |
5295 | } |
5296 | |
5297 | assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5298 | |
5299 | // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it |
5300 | // is better to ignore the hint and let the compiler choose a suitable VF. |
5301 | if (!UserVF.isScalable()) { |
5302 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5303 | << " is unsafe, clamping to max safe VF="
5304 | << MaxSafeFixedVF << ".\n");
5305 | ORE->emit([&]() { |
5306 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5307 | TheLoop->getStartLoc(), |
5308 | TheLoop->getHeader()) |
5309 | << "User-specified vectorization factor " |
5310 | << ore::NV("UserVectorizationFactor", UserVF) |
5311 | << " is unsafe, clamping to maximum safe vectorization factor " |
5312 | << ore::NV("VectorizationFactor", MaxSafeFixedVF); |
5313 | }); |
5314 | return MaxSafeFixedVF; |
5315 | } |
5316 | |
5317 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { |
5318 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5319 | << " is ignored because scalable vectors are not "
5320 | "available.\n");
5321 | ORE->emit([&]() { |
5322 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5323 | TheLoop->getStartLoc(), |
5324 | TheLoop->getHeader()) |
5325 | << "User-specified vectorization factor " |
5326 | << ore::NV("UserVectorizationFactor", UserVF) |
5327 | << " is ignored because the target does not support scalable " |
5328 | "vectors. The compiler will pick a more suitable value."; |
5329 | }); |
5330 | } else { |
5331 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5332 | << " is unsafe. Ignoring scalable UserVF.\n");
5333 | ORE->emit([&]() { |
5334 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5335 | TheLoop->getStartLoc(), |
5336 | TheLoop->getHeader()) |
5337 | << "User-specified vectorization factor " |
5338 | << ore::NV("UserVectorizationFactor", UserVF) |
5339 | << " is unsafe. Ignoring the hint to let the compiler pick a " |
5340 | "more suitable value."; |
5341 | }); |
5342 | } |
5343 | } |
5344 | |
5345 | LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5346 | << " / " << WidestType << " bits.\n");
5347 | |
5348 | FixedScalableVFPair Result(ElementCount::getFixed(1), |
5349 | ElementCount::getScalable(0)); |
5350 | if (auto MaxVF = |
5351 | getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, |
5352 | MaxSafeFixedVF, FoldTailByMasking)) |
5353 | Result.FixedVF = MaxVF; |
5354 | |
5355 | if (auto MaxVF = |
5356 | getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, |
5357 | MaxSafeScalableVF, FoldTailByMasking)) |
5358 | if (MaxVF.isScalable()) { |
5359 | Result.ScalableVF = MaxVF; |
5360 | LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5361 | << "\n");
5362 | } |
5363 | |
5364 | return Result; |
5365 | } |
5366 | |
5367 | FixedScalableVFPair |
5368 | LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { |
5369 | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { |
5370 | // TODO: It may be useful to do this, since the check is still likely to
5371 | // be dynamically uniform if the target can skip it.
5372 | reportVectorizationFailure( |
5373 | "Not inserting runtime ptr check for divergent target", |
5374 | "runtime pointer checks needed. Not enabled for divergent target", |
5375 | "CantVersionLoopWithDivergentTarget", ORE, TheLoop); |
5376 | return FixedScalableVFPair::getNone(); |
5377 | } |
5378 | |
5379 | unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); |
5380 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5381 | if (TC == 1) { |
5382 | reportVectorizationFailure("Single iteration (non) loop", |
5383 | "loop trip count is one, irrelevant for vectorization", |
5384 | "SingleIterationLoop", ORE, TheLoop); |
5385 | return FixedScalableVFPair::getNone(); |
5386 | } |
5387 | |
5388 | switch (ScalarEpilogueStatus) { |
5389 | case CM_ScalarEpilogueAllowed: |
5390 | return computeFeasibleMaxVF(TC, UserVF, false); |
5391 | case CM_ScalarEpilogueNotAllowedUsePredicate: |
5392 | LLVM_FALLTHROUGH;
5393 | case CM_ScalarEpilogueNotNeededUsePredicate: |
5394 | LLVM_DEBUG(
5395 | dbgs() << "LV: vector predicate hint/switch found.\n"
5396 | << "LV: Not allowing scalar epilogue, creating predicated "
5397 | << "vector loop.\n");
5398 | break; |
5399 | case CM_ScalarEpilogueNotAllowedLowTripLoop: |
5400 | // fallthrough as a special case of OptForSize |
5401 | case CM_ScalarEpilogueNotAllowedOptSize: |
5402 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) |
5403 | LLVM_DEBUG(
5404 | dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5405 | else
5406 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5407 | << "count.\n");
5408 | |
5409 | // Bail if runtime checks are required, which are not good when optimising |
5410 | // for size. |
5411 | if (runtimeChecksRequired()) |
5412 | return FixedScalableVFPair::getNone(); |
5413 | |
5414 | break; |
5415 | } |
5416 | |
5417 | // The only loops we can vectorize without a scalar epilogue are loops with
5418 | // a bottom-test and a single exiting block. We'd have to handle the fact |
5419 | // that not every instruction executes on the last iteration. This will |
5420 | // require a lane mask which varies through the vector loop body. (TODO) |
5421 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
5422 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
5423 | // masking, fallback to a vectorization with a scalar epilogue. |
5424 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
5425 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5426 | "scalar epilogue instead.\n");
5427 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
5428 | return computeFeasibleMaxVF(TC, UserVF, false); |
5429 | } |
5430 | return FixedScalableVFPair::getNone(); |
5431 | } |
5432 | |
5433 | // Now try tail folding.
5434 | |
5435 | // Invalidate interleave groups that require an epilogue if we can't mask |
5436 | // the interleave-group. |
5437 | if (!useMaskedInterleavedAccesses(TTI)) { |
5438 | assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5439 | "No decisions should have been taken at this point");
5440 | // Note: There is no need to invalidate any cost modeling decisions here, as
5441 | // none were taken so far.
5442 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
5443 | } |
5444 | |
5445 | FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); |
5446 | // Avoid tail folding if the trip count is known to be a multiple of any VF |
5447 | // we chose. |
5448 | // FIXME: The condition below pessimises the case for fixed-width vectors, |
5449 | // when scalable VFs are also candidates for vectorization. |
5450 | if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { |
5451 | ElementCount MaxFixedVF = MaxFactors.FixedVF; |
5452 | assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5453 | "MaxFixedVF must be a power of 2");
5454 | unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC |
5455 | : MaxFixedVF.getFixedValue(); |
5456 | ScalarEvolution *SE = PSE.getSE(); |
5457 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
5458 | const SCEV *ExitCount = SE->getAddExpr( |
5459 | BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); |
5460 | const SCEV *Rem = SE->getURemExpr( |
5461 | SE->applyLoopGuards(ExitCount, TheLoop), |
5462 | SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); |
5463 | if (Rem->isZero()) { |
5464 | // Accept MaxFixedVF if we do not have a tail. |
5465 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5466 | return MaxFactors; |
5467 | } |
5468 | } |
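     | // For illustration (values assumed): a trip count of 64 with
     | // MaxFixedVF == 8 and UserIC == 2 gives 64 urem (8 * 2) == 0, so no
     | // tail remains and MaxFactors is accepted above.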
5469 | |
5470 | // For scalable vectors don't use tail folding for low trip counts or |
5471 | // optimizing for code size. We only permit this if the user has explicitly |
5472 | // requested it. |
5473 | if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && |
5474 | ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && |
5475 | MaxFactors.ScalableVF.isVector()) |
5476 | MaxFactors.ScalableVF = ElementCount::getScalable(0); |
5477 | |
5478 | // If we don't know the precise trip count, or if the trip count that we |
5479 | // found modulo the vectorization factor is not zero, try to fold the tail |
5480 | // by masking. |
5481 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
5482 | if (Legal->prepareToFoldTailByMasking()) { |
5483 | FoldTailByMasking = true; |
5484 | return MaxFactors; |
5485 | } |
5486 | |
5487 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
5488 | // masking, fallback to a vectorization with a scalar epilogue. |
5489 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
5490 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5491 | "scalar epilogue instead.\n");
5492 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
5493 | return MaxFactors; |
5494 | } |
5495 | |
5496 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { |
5497 | LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5498 | return FixedScalableVFPair::getNone(); |
5499 | } |
5500 | |
5501 | if (TC == 0) { |
5502 | reportVectorizationFailure( |
5503 | "Unable to calculate the loop count due to complex control flow", |
5504 | "unable to calculate the loop count due to complex control flow", |
5505 | "UnknownLoopCountComplexCFG", ORE, TheLoop); |
5506 | return FixedScalableVFPair::getNone(); |
5507 | } |
5508 | |
5509 | reportVectorizationFailure( |
5510 | "Cannot optimize for size and vectorize at the same time.", |
5511 | "cannot optimize for size and vectorize at the same time. " |
5512 | "Enable vectorization of this loop with '#pragma clang loop " |
5513 | "vectorize(enable)' when compiling with -Os/-Oz", |
5514 | "NoTailLoopWithOptForSize", ORE, TheLoop); |
5515 | return FixedScalableVFPair::getNone(); |
5516 | } |
5517 | |
5518 | ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( |
5519 | unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, |
5520 | const ElementCount &MaxSafeVF, bool FoldTailByMasking) { |
5521 | bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); |
5522 | TypeSize WidestRegister = TTI.getRegisterBitWidth( |
5523 | ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
5524 | : TargetTransformInfo::RGK_FixedWidthVector); |
5525 | |
5526 | // Convenience function to return the minimum of two ElementCounts. |
5527 | auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { |
5528 | assert((LHS.isScalable() == RHS.isScalable()) &&
5529 | "Scalable flags must match");
5530 | return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; |
5531 | }; |
5532 | |
5533 | // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
5534 | // Note that both WidestRegister and WidestType may not be powers of 2.
5535 | auto MaxVectorElementCount = ElementCount::get( |
5536 | PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), |
5537 | ComputeScalableMaxVF); |
5538 | MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); |
5539 | LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5540 | << (MaxVectorElementCount * WidestType) << " bits.\n");
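     | // For illustration (values assumed): 128-bit vector registers and a
     | // widest type of 32 bits give
     | //   MaxVectorElementCount = PowerOf2Floor(128 / 32) = 4
     | // (or vscale x 4 when computing the scalable maximum).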
5541 | |
5542 | if (!MaxVectorElementCount) { |
5543 | LLVM_DEBUG(dbgs() << "LV: The target has no "
5544 | << (ComputeScalableMaxVF ? "scalable" : "fixed")
5545 | << " vector registers.\n");
5546 | return ElementCount::getFixed(1); |
5547 | } |
5548 | |
5549 | const auto TripCountEC = ElementCount::getFixed(ConstTripCount); |
5550 | if (ConstTripCount && |
5551 | ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && |
5552 | (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { |
5553 | // If loop trip count (TC) is known at compile time there is no point in |
5554 | // choosing VF greater than TC (as done in the loop below). Select maximum |
5555 | // power of two which doesn't exceed TC. |
5556 | // If MaxVectorElementCount is scalable, we only fall back on a fixed VF |
5557 | // when the TC is less than or equal to the known number of lanes. |
5558 | auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); |
5559 | LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5560 | "exceeding the constant trip count: "
5561 | << ClampedConstTripCount << "\n");
5562 | return ElementCount::getFixed(ClampedConstTripCount); |
5563 | } |
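     | // For illustration (values assumed): a constant trip count of 10 with
     | // MaxVectorElementCount == 16 and no tail folding takes the branch
     | // above and returns PowerOf2Floor(10) = 8 rather than a mostly-idle
     | // VF of 16.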
5564 | |
5565 | ElementCount MaxVF = MaxVectorElementCount; |
5566 | if (TTI.shouldMaximizeVectorBandwidth() || |
5567 | (MaximizeBandwidth && isScalarEpilogueAllowed())) { |
5568 | auto MaxVectorElementCountMaxBW = ElementCount::get( |
5569 | PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), |
5570 | ComputeScalableMaxVF); |
5571 | MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); |
5572 | |
5573 | // Collect all viable vectorization factors larger than the default MaxVF |
5574 | // (i.e. MaxVectorElementCount). |
5575 | SmallVector<ElementCount, 8> VFs; |
5576 | for (ElementCount VS = MaxVectorElementCount * 2; |
5577 | ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) |
5578 | VFs.push_back(VS); |
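     | // For illustration (values assumed): with MaxVectorElementCount == 4 and
     | // MaxVectorElementCountMaxBW == 16, the candidate set is {8, 16}; the
     | // register-usage scan below picks the largest candidate that fits.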
5579 | |
5580 | // For each VF calculate its register usage. |
5581 | auto RUs = calculateRegisterUsage(VFs); |
5582 | |
5583 | // Select the largest VF which doesn't require more registers than existing |
5584 | // ones. |
5585 | for (int i = RUs.size() - 1; i >= 0; --i) { |
5586 | bool Selected = true; |
5587 | for (auto &pair : RUs[i].MaxLocalUsers) { |
5588 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); |
5589 | if (pair.second > TargetNumRegisters) |
5590 | Selected = false; |
5591 | } |
5592 | if (Selected) { |
5593 | MaxVF = VFs[i]; |
5594 | break; |
5595 | } |
5596 | } |
5597 | if (ElementCount MinVF = |
5598 | TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { |
5599 | if (ElementCount::isKnownLT(MaxVF, MinVF)) { |
5600 | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5601 | << ") with target's minimum: " << MinVF << '\n');
5602 | MaxVF = MinVF; |
5603 | } |
5604 | } |
5605 | } |
5606 | return MaxVF; |
5607 | } |
5608 | |
5609 | Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { |
5610 | if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { |
5611 | auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); |
5612 | auto Min = Attr.getVScaleRangeMin(); |
5613 | auto Max = Attr.getVScaleRangeMax(); |
5614 | if (Max && Min == Max) |
5615 | return Max; |
5616 | } |
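     | // For illustration (hypothetical attribute): a function annotated with
     | // vscale_range(2,2) returns 2 here, while vscale_range(1,16) has
     | // Min != Max and falls through to the TTI hook below.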
5617 | |
5618 | return TTI.getVScaleForTuning(); |
5619 | } |
5620 | |
5621 | bool LoopVectorizationCostModel::isMoreProfitable( |
5622 | const VectorizationFactor &A, const VectorizationFactor &B) const { |
5623 | InstructionCost CostA = A.Cost; |
5624 | InstructionCost CostB = B.Cost; |
5625 | |
5626 | unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); |
5627 | |
5628 | if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && |
5629 | MaxTripCount) { |
5630 | // If we are folding the tail and the trip count is a known (possibly small) |
5631 | // constant, the trip count will be rounded up to an integer number of |
5632 | // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), |
5633 | // which we compare directly. When not folding the tail, the total cost will |
5634 | // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is |
5635 | // approximated with the per-lane cost below instead of using the tripcount |
5636 | // as here. |
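// Illustrative arithmetic with hypothetical costs: if MaxTripCount = 10, a
// VF=4 plan costing 8 per iteration totals 8 * ceil(10/4) = 24, while a
// VF=8 plan costing 14 totals 14 * ceil(10/8) = 28, so VF=4 is preferred
// here even though its per-lane cost (8/4 = 2) exceeds VF=8's (14/8 = 1.75).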
5637 | auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); |
5638 | auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); |
5639 | return RTCostA < RTCostB; |
5640 | } |
5641 | |
5642 | // Improve estimate for the vector width if it is scalable. |
5643 | unsigned EstimatedWidthA = A.Width.getKnownMinValue(); |
5644 | unsigned EstimatedWidthB = B.Width.getKnownMinValue(); |
5645 | if (Optional<unsigned> VScale = getVScaleForTuning()) { |
5646 | if (A.Width.isScalable()) |
5647 | EstimatedWidthA *= VScale.getValue(); |
5648 | if (B.Width.isScalable()) |
5649 | EstimatedWidthB *= VScale.getValue(); |
5650 | } |
5651 | |
5652 | // Assume vscale may be larger than 1 (or the value being tuned for), |
5653 | // so that scalable vectorization is slightly favorable over fixed-width |
5654 | // vectorization. |
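// Note the '<=' below: with equal normalized costs the scalable candidate A
// wins the comparison, which is what makes it "slightly favorable".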
5655 | if (A.Width.isScalable() && !B.Width.isScalable()) |
5656 | return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); |
5657 | |
5658 | // To avoid the need for FP division: |
5659 | // (CostA / A.Width) < (CostB / B.Width) |
5660 | // <=> (CostA * B.Width) < (CostB * A.Width) |
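// For example, with hypothetical values CostA=10 at width 4 and CostB=18 at
// width 8: 10 * 8 = 80 is not less than 18 * 4 = 72, so B (2.25 per lane)
// beats A (2.5 per lane).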
5661 | return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); |
5662 | } |
5663 | |
5664 | VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( |
5665 | const ElementCountSet &VFCandidates) { |
5666 | InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; |
5667 | LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5668 | assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5669 | assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5670 | "Expected Scalar VF to be a candidate");
5671 | |
5672 | const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); |
5673 | VectorizationFactor ChosenFactor = ScalarCost; |
5674 | |
5675 | bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; |
5676 | if (ForceVectorization && VFCandidates.size() > 1) { |
5677 | // Ignore scalar width, because the user explicitly wants vectorization. |
5678 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
5679 | // evaluation. |
5680 | ChosenFactor.Cost = InstructionCost::getMax(); |
5681 | } |
5682 | |
5683 | SmallVector<InstructionVFPair> InvalidCosts; |
5684 | for (const auto &i : VFCandidates) { |
5685 | // The cost for scalar VF=1 is already calculated, so ignore it. |
5686 | if (i.isScalar()) |
5687 | continue; |
5688 | |
5689 | VectorizationCostTy C = expectedCost(i, &InvalidCosts); |
5690 | VectorizationFactor Candidate(i, C.first); |
5691 | |
5692 | #ifndef NDEBUG |
5693 | unsigned AssumedMinimumVscale = 1; |
5694 | if (Optional<unsigned> VScale = getVScaleForTuning()) |
5695 | AssumedMinimumVscale = VScale.getValue(); |
5696 | unsigned Width = |
5697 | Candidate.Width.isScalable() |
5698 | ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale |
5699 | : Candidate.Width.getFixedValue(); |
5700 | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5701 | << " costs: " << (Candidate.Cost / Width));
5702 | if (i.isScalable()) |
5703 | LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5704 | << AssumedMinimumVscale << ")");
5705 | LLVM_DEBUG(dbgs() << ".\n");
5706 | #endif |
5707 | |
5708 | if (!C.second && !ForceVectorization) { |
5709 | LLVM_DEBUG(
5710 | dbgs() << "LV: Not considering vector loop of width " << i
5711 | << " because it will not generate any vector instructions.\n");
5712 | continue; |
5713 | } |
5714 | |
5715 | // If profitable, add it to the ProfitableVFs list.
5716 | if (isMoreProfitable(Candidate, ScalarCost)) |
5717 | ProfitableVFs.push_back(Candidate); |
5718 | |
5719 | if (isMoreProfitable(Candidate, ChosenFactor)) |
5720 | ChosenFactor = Candidate; |
5721 | } |
5722 | |
5723 | // Emit a report of VFs with invalid costs in the loop. |
5724 | if (!InvalidCosts.empty()) { |
5725 | // Group the remarks per instruction, keeping the instruction order from |
5726 | // InvalidCosts. |
5727 | std::map<Instruction *, unsigned> Numbering; |
5728 | unsigned I = 0; |
5729 | for (auto &Pair : InvalidCosts) |
5730 | if (!Numbering.count(Pair.first)) |
5731 | Numbering[Pair.first] = I++; |
5732 | |
5733 | // Sort the list, first on instruction number, then on VF.
5734 | llvm::sort(InvalidCosts, |
5735 | [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { |
5736 | if (Numbering[A.first] != Numbering[B.first]) |
5737 | return Numbering[A.first] < Numbering[B.first]; |
5738 | ElementCountComparator ECC; |
5739 | return ECC(A.second, B.second); |
5740 | }); |
5741 | |
5742 | // For a list of ordered instruction-vf pairs: |
5743 | // [(load, vf1), (load, vf2), (store, vf1)] |
5744 | // Group the instructions together to emit separate remarks for: |
5745 | // load (vf1, vf2) |
5746 | // store (vf1) |
5747 | auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); |
5748 | auto Subset = ArrayRef<InstructionVFPair>(); |
5749 | do { |
5750 | if (Subset.empty()) |
5751 | Subset = Tail.take_front(1); |
5752 | |
5753 | Instruction *I = Subset.front().first; |
5754 | |
5755 | // If the next instruction is different, or if there are no other pairs, |
5756 | // emit a remark for the collated subset. e.g. |
5757 | // [(load, vf1), (load, vf2)]
5758 | // to emit:
5759 | // remark: invalid costs for 'load' at VF=(vf1, vf2)
5760 | if (Subset == Tail || Tail[Subset.size()].first != I) { |
5761 | std::string OutString; |
5762 | raw_string_ostream OS(OutString); |
5763 | assert(!Subset.empty() && "Unexpected empty range");
5764 | OS << "Instruction with invalid costs prevented vectorization at VF=("; |
5765 | for (auto &Pair : Subset) |
5766 | OS << (Pair.second == Subset.front().second ? "" : ", ") |
5767 | << Pair.second; |
5768 | OS << "):"; |
5769 | if (auto *CI = dyn_cast<CallInst>(I)) |
5770 | OS << " call to " << CI->getCalledFunction()->getName(); |
5771 | else |
5772 | OS << " " << I->getOpcodeName(); |
5773 | OS.flush(); |
5774 | reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); |
5775 | Tail = Tail.drop_front(Subset.size()); |
5776 | Subset = {}; |
5777 | } else |
5778 | // Grow the subset by one element |
5779 | Subset = Tail.take_front(Subset.size() + 1); |
5780 | } while (!Tail.empty()); |
5781 | } |
5782 | |
5783 | if (!EnableCondStoresVectorization && NumPredStores) { |
5784 | reportVectorizationFailure("There are conditional stores.", |
5785 | "store that is conditionally executed prevents vectorization", |
5786 | "ConditionalStore", ORE, TheLoop); |
5787 | ChosenFactor = ScalarCost; |
5788 | } |
5789 | |
5790 | LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5791 | ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5792 | << "LV: Vectorization seems to be not beneficial, "
5793 | << "but was forced by a user.\n");
5794 | LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5795 | return ChosenFactor; |
5796 | } |
5797 | |
5798 | bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( |
5799 | const Loop &L, ElementCount VF) const { |
5800 | // Cross iteration phis such as reductions need special handling and are |
5801 | // currently unsupported. |
5802 | if (any_of(L.getHeader()->phis(), |
5803 | [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) |
5804 | return false; |
5805 | |
5806 | // Phis with uses outside of the loop require special handling and are |
5807 | // currently unsupported. |
5808 | for (auto &Entry : Legal->getInductionVars()) { |
5809 | // Look for uses of the value of the induction at the last iteration. |
5810 | Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); |
5811 | for (User *U : PostInc->users()) |
5812 | if (!L.contains(cast<Instruction>(U))) |
5813 | return false; |
5814 | // Look for uses of penultimate value of the induction. |
5815 | for (User *U : Entry.first->users()) |
5816 | if (!L.contains(cast<Instruction>(U))) |
5817 | return false; |
5818 | } |
5819 | |
5820 | // Induction variables that are widened require special handling that is |
5821 | // currently not supported. |
5822 | if (any_of(Legal->getInductionVars(), [&](auto &Entry) { |
5823 | return !(this->isScalarAfterVectorization(Entry.first, VF) || |
5824 | this->isProfitableToScalarize(Entry.first, VF)); |
5825 | })) |
5826 | return false; |
5827 | |
5828 | // Epilogue vectorization code has not been audited to ensure it handles
5829 | // non-latch exits properly. It may be fine, but it needs to be audited and
5830 | // tested.
5831 | if (L.getExitingBlock() != L.getLoopLatch()) |
5832 | return false; |
5833 | |
5834 | return true; |
5835 | } |
5836 | |
5837 | bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( |
5838 | const ElementCount VF) const { |
5839 | // FIXME: We need a much better cost-model to take different parameters such |
5840 | // as register pressure, code size increase and cost of extra branches into |
5841 | // account. For now we apply a very crude heuristic and only consider loops |
5842 | // with vectorization factors larger than a certain value. |
5843 | // We also consider epilogue vectorization unprofitable for targets that don't |
5844 | // consider interleaving beneficial (e.g. MVE).
5845 | if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) |
5846 | return false; |
5847 | // FIXME: We should consider changing the threshold for scalable |
5848 | // vectors to take VScaleForTuning into account. |
5849 | if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) |
5850 | return true; |
5851 | return false; |
5852 | } |
5853 | |
5854 | VectorizationFactor |
5855 | LoopVectorizationCostModel::selectEpilogueVectorizationFactor( |
5856 | const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { |
5857 | VectorizationFactor Result = VectorizationFactor::Disabled(); |
5858 | if (!EnableEpilogueVectorization) { |
5859 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5860 | return Result; |
5861 | } |
5862 | |
5863 | if (!isScalarEpilogueAllowed()) { |
5864 | LLVM_DEBUG(
5865 | dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5866 | "allowed.\n");
5867 | return Result; |
5868 | } |
5869 | |
5870 | // Not really a cost consideration, but check for unsupported cases here to |
5871 | // simplify the logic. |
5872 | if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { |
5873 | LLVM_DEBUG(
5874 | dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5875 | "not a supported candidate.\n");
5876 | return Result; |
5877 | } |
5878 | |
5879 | if (EpilogueVectorizationForceVF > 1) { |
5880 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5881 | ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); |
5882 | if (LVP.hasPlanWithVF(ForcedEC)) |
5883 | return {ForcedEC, 0}; |
5884 | else { |
5885 | LLVM_DEBUG(
5886 | dbgs()
5887 | << "LEV: Epilogue vectorization forced factor is not viable.\n");
5888 | return Result; |
5889 | } |
5890 | } |
5891 | |
5892 | if (TheLoop->getHeader()->getParent()->hasOptSize() || |
5893 | TheLoop->getHeader()->getParent()->hasMinSize()) { |
5894 | LLVM_DEBUG(
5895 | dbgs()
5896 | << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5897 | return Result; |
5898 | } |
5899 | |
5900 | if (!isEpilogueVectorizationProfitable(MainLoopVF)) { |
5901 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5902 | "this loop\n");
5903 | return Result; |
5904 | } |
5905 | |
5906 | // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know |
5907 | // the main loop handles 8 lanes per iteration. We could still benefit from |
5908 | // vectorizing the epilogue loop with VF=4. |
5909 | ElementCount EstimatedRuntimeVF = MainLoopVF; |
5910 | if (MainLoopVF.isScalable()) { |
5911 | EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); |
5912 | if (Optional<unsigned> VScale = getVScaleForTuning()) |
5913 | EstimatedRuntimeVF *= VScale.getValue(); |
5914 | } |
5915 | |
5916 | for (auto &NextVF : ProfitableVFs) |
5917 | if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && |
5918 | ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || |
5919 | ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && |
5920 | (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && |
5921 | LVP.hasPlanWithVF(NextVF.Width)) |
5922 | Result = NextVF; |
5923 | |
5924 | if (Result != VectorizationFactor::Disabled()) |
5925 | LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5926 | << Result.Width << "\n");
5927 | return Result; |
5928 | } |
5929 | |
5930 | std::pair<unsigned, unsigned> |
5931 | LoopVectorizationCostModel::getSmallestAndWidestTypes() { |
5932 | unsigned MinWidth = -1U; |
5933 | unsigned MaxWidth = 8; |
5934 | const DataLayout &DL = TheFunction->getParent()->getDataLayout(); |
5935 | // For in-loop reductions, no element types are added to ElementTypesInLoop |
5936 | // if there are no loads/stores in the loop. In this case, check through the |
5937 | // reduction variables to determine the maximum width. |
5938 | if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { |
5939 | // Reset MaxWidth so that we can find the smallest type used by recurrences |
5940 | // in the loop. |
5941 | MaxWidth = -1U; |
5942 | for (auto &PhiDescriptorPair : Legal->getReductionVars()) { |
5943 | const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; |
5944 | // When finding the min width used by the recurrence we need to account |
5945 | // for casts on the input operands of the recurrence. |
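// For instance, a reduction that loads i8 values, zero-extends them and
// accumulates into an i32 phi has an i32 recurrence type but a minimum
// cast width of 8 bits, so 8 is the width recorded here.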
5946 | MaxWidth = std::min<unsigned>( |
5947 | MaxWidth, std::min<unsigned>( |
5948 | RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), |
5949 | RdxDesc.getRecurrenceType()->getScalarSizeInBits())); |
5950 | } |
5951 | } else { |
5952 | for (Type *T : ElementTypesInLoop) { |
5953 | MinWidth = std::min<unsigned>( |
5954 | MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); |
5955 | MaxWidth = std::max<unsigned>( |
5956 | MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); |
5957 | } |
5958 | } |
5959 | return {MinWidth, MaxWidth}; |
5960 | } |
5961 | |
5962 | void LoopVectorizationCostModel::collectElementTypesForWidening() { |
5963 | ElementTypesInLoop.clear(); |
5964 | // For each block. |
5965 | for (BasicBlock *BB : TheLoop->blocks()) { |
5966 | // For each instruction in the loop. |
5967 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5968 | Type *T = I.getType(); |
5969 | |
5970 | // Skip ignored values. |
5971 | if (ValuesToIgnore.count(&I)) |
5972 | continue; |
5973 | |
5974 | // Only examine Loads, Stores and PHINodes. |
5975 | if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) |
5976 | continue; |
5977 | |
5978 | // Examine PHI nodes that are reduction variables. Update the type to |
5979 | // account for the recurrence type. |
5980 | if (auto *PN = dyn_cast<PHINode>(&I)) { |
5981 | if (!Legal->isReductionVariable(PN)) |
5982 | continue; |
5983 | const RecurrenceDescriptor &RdxDesc = |
5984 | Legal->getReductionVars().find(PN)->second; |
5985 | if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || |
5986 | TTI.preferInLoopReduction(RdxDesc.getOpcode(), |
5987 | RdxDesc.getRecurrenceType(), |
5988 | TargetTransformInfo::ReductionFlags())) |
5989 | continue; |
5990 | T = RdxDesc.getRecurrenceType(); |
5991 | } |
5992 | |
5993 | // Examine the stored values. |
5994 | if (auto *ST = dyn_cast<StoreInst>(&I)) |
5995 | T = ST->getValueOperand()->getType(); |
5996 | |
5997 | assert(T->isSized() &&
5998 | "Expected the load/store/recurrence type to be sized");
5999 | |
6000 | ElementTypesInLoop.insert(T); |
6001 | } |
6002 | } |
6003 | } |
6004 | |
6005 | unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, |
6006 | unsigned LoopCost) { |
6007 | // -- The interleave heuristics -- |
6008 | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
6009 | // There are many micro-architectural considerations that we can't predict |
6010 | // at this level. For example, frontend pressure (on decode or fetch) due to |
6011 | // code size, or the number and capabilities of the execution ports. |
6012 | // |
6013 | // We use the following heuristics to select the interleave count: |
6014 | // 1. If the code has reductions, then we interleave to break the cross |
6015 | // iteration dependency. |
6016 | // 2. If the loop is really small, then we interleave to reduce the loop |
6017 | // overhead. |
6018 | // 3. We don't interleave if we think that we will spill registers to memory |
6019 | // due to the increased register pressure. |
6020 | |
6021 | if (!isScalarEpilogueAllowed()) |
6022 | return 1; |
6023 | |
6024 | // The max safe dependence distance was already used to limit the VF; do not interleave further.
6025 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
6026 | return 1; |
6027 | |
6028 | auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); |
6029 | const bool HasReductions = !Legal->getReductionVars().empty(); |
6030 | // Do not interleave loops with a relatively small known or estimated trip |
6031 | // count. But we will interleave when InterleaveSmallLoopScalarReduction is |
6032 | // enabled, and the code has scalar reductions (HasReductions && VF == 1),
6033 | // because with the above conditions interleaving can expose ILP and break |
6034 | // cross iteration dependences for reductions. |
6035 | if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && |
6036 | !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) |
6037 | return 1; |
6038 | |
6039 | RegisterUsage R = calculateRegisterUsage({VF})[0]; |
6040 | // We divide by these counts below, so clamp each to at least one to
6041 | // avoid dividing by zero (every loop uses at least one register).
6042 | for (auto& pair : R.MaxLocalUsers) { |
6043 | pair.second = std::max(pair.second, 1U); |
6044 | } |
6045 | |
6046 | // We calculate the interleave count using the following formula. |
6047 | // Subtract the number of loop invariants from the number of available |
6048 | // registers. These registers are used by all of the interleaved instances. |
6049 | // Next, divide the remaining registers by the number of registers that is |
6050 | // required by the loop, in order to estimate how many parallel instances |
6051 | // fit without causing spills. All of this is rounded down if necessary to be |
6052 | // a power of two. We want power of two interleave count to simplify any |
6053 | // addressing operations or alignment considerations. |
6054 | // We also want power of two interleave counts to ensure that the induction |
6055 | // variable of the vector loop wraps to zero, when tail is folded by masking; |
6056 | // this currently happens when OptForSize, in which case IC is set to 1 above. |
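// Worked example with hypothetical register counts: 32 target registers,
// 2 loop-invariant values and 7 local users give (32 - 2) / 7 = 4, and
// PowerOf2Floor(4) = 4, i.e. four interleaved copies are assumed to fit
// without spilling.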
6057 | unsigned IC = UINT_MAX;
6058 | |
6059 | for (auto& pair : R.MaxLocalUsers) { |
6060 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); |
6061 | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6062 | << " registers of "
6063 | << TTI.getRegisterClassName(pair.first) << " register class\n");
6064 | if (VF.isScalar()) { |
6065 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
6066 | TargetNumRegisters = ForceTargetNumScalarRegs; |
6067 | } else { |
6068 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
6069 | TargetNumRegisters = ForceTargetNumVectorRegs; |
6070 | } |
6071 | unsigned MaxLocalUsers = pair.second; |
6072 | unsigned LoopInvariantRegs = 0; |
6073 | if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) |
6074 | LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; |
6075 | |
6076 | unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); |
6077 | // Don't count the induction variable as interleaved. |
6078 | if (EnableIndVarRegisterHeur) { |
6079 | TmpIC = |
6080 | PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / |
6081 | std::max(1U, (MaxLocalUsers - 1))); |
6082 | } |
6083 | |
6084 | IC = std::min(IC, TmpIC); |
6085 | } |
6086 | |
6087 | // Clamp the interleave ranges to reasonable counts. |
6088 | unsigned MaxInterleaveCount = |
6089 | TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); |
6090 | |
6091 | // Check if the user has overridden the max. |
6092 | if (VF.isScalar()) { |
6093 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
6094 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
6095 | } else { |
6096 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
6097 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
6098 | } |
6099 | |
6100 | // If trip count is known or estimated compile time constant, limit the |
6101 | // interleave count to at most the trip count divided by VF, provided it
6102 | // is at least 1. |
6103 | // |
6104 | // For scalable vectors we can't know if interleaving is beneficial. It may |
6105 | // not be beneficial for small loops if none of the lanes in the second vector |
6106 | // iterations is enabled. However, for larger loops, there is likely to be a |
6107 | // similar benefit as for fixed-width vectors. For now, we choose to leave |
6108 | // the InterleaveCount as if vscale is '1', although if some information about |
6109 | // the vector is known (e.g. min vector size), we can make a better decision. |
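// E.g. a known trip count of 48 with a minimum VF of 8 limits the
// interleave count to at most 48 / 8 = 6, whatever the target allows.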
6110 | if (BestKnownTC) { |
6111 | MaxInterleaveCount = |
6112 | std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); |
6113 | // Make sure MaxInterleaveCount is greater than 0. |
6114 | MaxInterleaveCount = std::max(1u, MaxInterleaveCount); |
6115 | } |
6116 | |
6117 | assert(MaxInterleaveCount > 0 &&
6118 | "Maximum interleave count must be greater than 0");
6119 | |
6120 | // Clamp the calculated IC to be between the 1 and the max interleave count |
6121 | // that the target and trip count allows. |
6122 | if (IC > MaxInterleaveCount) |
6123 | IC = MaxInterleaveCount; |
6124 | else |
6125 | // Make sure IC is greater than 0. |
6126 | IC = std::max(1u, IC); |
6127 | |
6128 | assert(IC > 0 && "Interleave count must be greater than 0.");
6129 | |
6130 | // If we did not calculate the cost for VF (because the user selected the VF) |
6131 | // then we calculate the cost of VF here. |
6132 | if (LoopCost == 0) { |
6133 | InstructionCost C = expectedCost(VF).first; |
6134 | assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6135 | LoopCost = *C.getValue(); |
6136 | } |
6137 | |
6138 | assert(LoopCost && "Non-zero loop cost expected");
6139 | |
6140 | // Interleave if we vectorized this loop and there is a reduction that could |
6141 | // benefit from interleaving. |
6142 | if (VF.isVector() && HasReductions) { |
6143 | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6144 | return IC; |
6145 | } |
6146 | |
6147 | // Note that if we've already vectorized the loop we will have done the |
6148 | // runtime check and so interleaving won't require further checks. |
6149 | bool InterleavingRequiresRuntimePointerCheck = |
6150 | (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); |
6151 | |
6152 | // We want to interleave small loops in order to reduce the loop overhead and |
6153 | // potentially expose ILP opportunities. |
6154 | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6155 | << "LV: IC is " << IC << '\n'
6156 | << "LV: VF is " << VF << '\n');
6157 | const bool AggressivelyInterleaveReductions = |
6158 | TTI.enableAggressiveInterleaving(HasReductions); |
6159 | if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { |
6160 | // We assume that the cost overhead is 1 and we use the cost model |
6161 | // to estimate the cost of the loop and interleave until the cost of the |
6162 | // loop overhead is about 5% of the cost of the loop. |
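// Illustrative numbers, assuming a SmallLoopCost threshold of 20: a loop
// body costing 6 yields PowerOf2Floor(20 / 6) = 2, so SmallIC = min(IC, 2).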
6163 | unsigned SmallIC = |
6164 | std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); |
6165 | |
6166 | // Interleave until store/load ports (estimated by max interleave count) are |
6167 | // saturated. |
6168 | unsigned NumStores = Legal->getNumStores(); |
6169 | unsigned NumLoads = Legal->getNumLoads(); |
6170 | unsigned StoresIC = IC / (NumStores ? NumStores : 1); |
6171 | unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); |
6172 | |
6173 | // There is little point in interleaving for reductions containing selects |
6174 | // and compares when VF=1 since it may just create more overhead than it's |
6175 | // worth for loops with small trip counts. This is because we still have to |
6176 | // do the final reduction after the loop. |
6177 | bool HasSelectCmpReductions = |
6178 | HasReductions && |
6179 | any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { |
6180 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
6181 | return RecurrenceDescriptor::isSelectCmpRecurrenceKind( |
6182 | RdxDesc.getRecurrenceKind()); |
6183 | }); |
6184 | if (HasSelectCmpReductions) { |
6185 | LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6186 | return 1; |
6187 | } |
6188 | |
6189 | // If we have a scalar reduction (vector reductions are already dealt with |
6190 | // by this point), we can increase the critical path length if the loop |
6191 | // we're interleaving is inside another loop. For tree-wise reductions |
6192 | // set the limit to 2, and for ordered reductions it's best to disable |
6193 | // interleaving entirely. |
6194 | if (HasReductions && TheLoop->getLoopDepth() > 1) { |
6195 | bool HasOrderedReductions = |
6196 | any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { |
6197 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
6198 | return RdxDesc.isOrdered(); |
6199 | }); |
6200 | if (HasOrderedReductions) { |
6201 | LLVM_DEBUG(
6202 | dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6203 | return 1; |
6204 | } |
6205 | |
6206 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); |
6207 | SmallIC = std::min(SmallIC, F); |
6208 | StoresIC = std::min(StoresIC, F); |
6209 | LoadsIC = std::min(LoadsIC, F); |
6210 | } |
6211 | |
6212 | if (EnableLoadStoreRuntimeInterleave && |
6213 | std::max(StoresIC, LoadsIC) > SmallIC) { |
6214 | LLVM_DEBUG(
6215 | dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6216 | return std::max(StoresIC, LoadsIC); |
6217 | } |
6218 | |
6219 | // If there are scalar reductions and TTI has enabled aggressive |
6220 | // interleaving for reductions, we will interleave to expose ILP. |
6221 | if (InterleaveSmallLoopScalarReduction && VF.isScalar() && |
6222 | AggressivelyInterleaveReductions) { |
6223 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6224 | // Interleave no less than SmallIC but not as aggressively as the normal IC
6225 | // to satisfy the rare situation when resources are too limited. |
6226 | return std::max(IC / 2, SmallIC); |
6227 | } else { |
6228 | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6229 | return SmallIC; |
6230 | } |
6231 | } |
6232 | |
6233 | // Interleave if this is a large loop (small loops are already dealt with by |
6234 | // this point) that could benefit from interleaving. |
6235 | if (AggressivelyInterleaveReductions) { |
6236 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6237 | return IC; |
6238 | } |
6239 | |
6240 | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6241 | return 1; |
6242 | } |
6243 | |
6244 | SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> |
6245 | LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { |
6246 | // This function calculates the register usage by measuring the highest number |
6247 | // of values that are alive at a single location. Obviously, this is a very |
6248 | // rough estimation. We scan the loop in topological order and
6249 | // assign a number to each instruction. We use RPO to ensure that defs are |
6250 | // met before their users. We assume that each instruction that has in-loop |
6251 | // users starts an interval. We record every time that an in-loop value is |
6252 | // used, so we have a list of the first and last occurrences of each |
6253 | // instruction. Next, we transpose this data structure into a multi map that |
6254 | // holds the list of intervals that *end* at a specific location. This multi |
6255 | // map allows us to perform a linear search. We scan the instructions linearly |
6256 | // and record each time that a new interval starts, by placing it in a set. |
6257 | // If we find this value in the multi-map then we remove it from the set. |
6258 | // The max register usage is the maximum size of the set. |
6259 | // We also search for instructions that are defined outside the loop, but are |
6260 | // used inside the loop. We need this number separately from the max-interval |
6261 | // usage number because when we unroll, loop-invariant values do not take |
6262 | // more registers.
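// Tiny illustration: for a body 'a = load; b = a + 1; store b', the
// interval for 'a' ends at its use in 'b' and the interval for 'b' ends at
// the store, so at most two values are live at once and roughly two
// registers per interleaved instance are assumed.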
6263 | LoopBlocksDFS DFS(TheLoop); |
6264 | DFS.perform(LI); |
6265 | |
6266 | RegisterUsage RU; |
6267 | |
6268 | // Each 'key' in the map opens a new interval. The values |
6269 | // of the map are the index of the 'last seen' usage of the |
6270 | // instruction that is the key. |
6271 | using IntervalMap = DenseMap<Instruction *, unsigned>; |
6272 | |
6273 | // Maps instruction to its index. |
6274 | SmallVector<Instruction *, 64> IdxToInstr; |
6275 | // Marks the end of each interval. |
6276 | IntervalMap EndPoint; |
6277 | // Saves the list of instruction indices that are used in the loop. |
6278 | SmallPtrSet<Instruction *, 8> Ends; |
6279 | // Saves the list of values that are used in the loop but are |
6280 | // defined outside the loop, such as arguments and constants. |
6281 | SmallPtrSet<Value *, 8> LoopInvariants; |
6282 | |
6283 | for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { |
6284 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
6285 | IdxToInstr.push_back(&I); |
6286 | |
6287 | // Save the end location of each USE. |
6288 | for (Value *U : I.operands()) { |
6289 | auto *Instr = dyn_cast<Instruction>(U); |
6290 | |
6291 | // Ignore non-instruction values such as arguments, constants, etc. |
6292 | if (!Instr) |
6293 | continue; |
6294 | |
6295 | // If this instruction is outside the loop then record it and continue. |
6296 | if (!TheLoop->contains(Instr)) { |
6297 | LoopInvariants.insert(Instr); |
6298 | continue; |
6299 | } |
6300 | |
6301 | // Overwrite previous end points. |
6302 | EndPoint[Instr] = IdxToInstr.size(); |
6303 | Ends.insert(Instr); |
6304 | } |
6305 | } |
6306 | } |
6307 | |
6308 | // Saves the list of intervals that end with the index in 'key'. |
6309 | using InstrList = SmallVector<Instruction *, 2>; |
6310 | DenseMap<unsigned, InstrList> TransposeEnds; |
6311 | |
6312 | // Transpose the EndPoints to a list of values that end at each index. |
6313 | for (auto &Interval : EndPoint) |
6314 | TransposeEnds[Interval.second].push_back(Interval.first); |
6315 | |
6316 | SmallPtrSet<Instruction *, 8> OpenIntervals; |
6317 | SmallVector<RegisterUsage, 8> RUs(VFs.size()); |
6318 | SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); |
6319 | |
6320 | LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6321 | |
6322 | // A lambda that gets the register usage for the given type and VF. |
6323 | const auto &TTICapture = TTI; |
6324 | auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { |
6325 | if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) |
6326 | return 0; |
6327 | InstructionCost::CostType RegUsage = |
6328 | *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); |
6329 | assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6330 | "Nonsensical values for register usage.");
6331 | return RegUsage; |
6332 | }; |
6333 | |
6334 | for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { |
6335 | Instruction *I = IdxToInstr[i]; |
6336 | |
6337 | // Remove all of the instructions that end at this location. |
6338 | InstrList &List = TransposeEnds[i]; |
6339 | for (Instruction *ToRemove : List) |
6340 | OpenIntervals.erase(ToRemove); |
6341 | |
6342 | // Ignore instructions that are never used within the loop. |
6343 | if (!Ends.count(I)) |
6344 | continue; |
6345 | |
6346 | // Skip ignored values. |
6347 | if (ValuesToIgnore.count(I)) |
6348 | continue; |
6349 | |
6350 | // For each VF find the maximum usage of registers. |
6351 | for (unsigned j = 0, e = VFs.size(); j < e; ++j) { |
6352 | // Count the number of live intervals. |
6353 | SmallMapVector<unsigned, unsigned, 4> RegUsage; |
6354 | |
6355 | if (VFs[j].isScalar()) { |
6356 | for (auto Inst : OpenIntervals) { |
6357 | unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); |
6358 | if (RegUsage.find(ClassID) == RegUsage.end()) |
6359 | RegUsage[ClassID] = 1; |
6360 | else |
6361 | RegUsage[ClassID] += 1; |
6362 | } |
6363 | } else { |
6364 | collectUniformsAndScalars(VFs[j]); |
6365 | for (auto Inst : OpenIntervals) { |
6366 | // Skip ignored values for VF > 1. |
6367 | if (VecValuesToIgnore.count(Inst)) |
6368 | continue; |
6369 | if (isScalarAfterVectorization(Inst, VFs[j])) { |
6370 | unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); |
6371 | if (RegUsage.find(ClassID) == RegUsage.end()) |
6372 | RegUsage[ClassID] = 1; |
6373 | else |
6374 | RegUsage[ClassID] += 1; |
6375 | } else { |
6376 | unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); |
6377 | if (RegUsage.find(ClassID) == RegUsage.end()) |
6378 | RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); |
6379 | else |
6380 | RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); |
6381 | } |
6382 | } |
6383 | } |
6384 | |
6385 | for (auto& pair : RegUsage) { |
6386 | if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) |
6387 | MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); |
6388 | else |
6389 | MaxUsages[j][pair.first] = pair.second; |
6390 | } |
6391 | } |
6392 | |
6393 | LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6394 | << OpenIntervals.size() << '\n');
6395 | |
6396 | // Add the current instruction to the list of open intervals. |
6397 | OpenIntervals.insert(I); |
6398 | } |
6399 | |
6400 | for (unsigned i = 0, e = VFs.size(); i < e; ++i) { |
6401 | SmallMapVector<unsigned, unsigned, 4> Invariant; |
6402 | |
6403 | for (auto Inst : LoopInvariants) { |
6404 | unsigned Usage = |
6405 | VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); |
6406 | unsigned ClassID = |
6407 | TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); |
6408 | if (Invariant.find(ClassID) == Invariant.end()) |
6409 | Invariant[ClassID] = Usage; |
6410 | else |
6411 | Invariant[ClassID] += Usage; |
6412 | } |
6413 | |
6414 | LLVM_DEBUG({
6415 | dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6416 | dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6417 | << " item\n";
6418 | for (const auto &pair : MaxUsages[i]) {
6419 | dbgs() << "LV(REG): RegisterClass: "
6420 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6421 | << " registers\n";
6422 | }
6423 | dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6424 | << " item\n";
6425 | for (const auto &pair : Invariant) {
6426 | dbgs() << "LV(REG): RegisterClass: "
6427 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6428 | << " registers\n";
6429 | }
6430 | });
6431 | |
6432 | RU.LoopInvariantRegs = Invariant; |
6433 | RU.MaxLocalUsers = MaxUsages[i]; |
6434 | RUs[i] = RU; |
6435 | } |
6436 | |
6437 | return RUs; |
6438 | } |
6439 | |
6440 | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, |
6441 | ElementCount VF) { |
6442 | // TODO: Cost model for emulated masked load/store is completely |
6443 | // broken. This hack guides the cost model to use an artificially |
6444 | // high enough value to practically disable vectorization with such |
6445 | // operations, except where the previously deployed legality hack allowed
6446 | // using very low cost values. This is to avoid regressions coming simply
6447 | // from moving the "masked load/store" check from legality to the cost model.
6448 | // Masked Load/Gather emulation was previously never allowed.
6449 | // A limited number of Masked Store/Scatter emulations were allowed.
6450 | assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6451 | return isa<LoadInst>(I) || |
6452 | (isa<StoreInst>(I) && |
6453 | NumPredStores > NumberOfStoresToPredicate); |
6454 | } |
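     | // Illustrative consequence: when this hook returns true,
     | // getMemInstScalarizationCost() below overrides the computed cost with the
     | // artificial constant 3000000, so in practice any VF that requires such an
     | // emulated masked access loses the cost comparison.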
6455 | |
6456 | void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { |
6457 | // If we aren't vectorizing the loop, or if we've already collected the |
6458 | // instructions to scalarize, there's nothing to do. Collection may already |
6459 | // have occurred if we have a user-selected VF and are now computing the |
6460 | // expected cost for interleaving. |
6461 | if (VF.isScalar() || VF.isZero() || |
6462 | InstsToScalarize.find(VF) != InstsToScalarize.end()) |
6463 | return; |
6464 | |
6465 | // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6466 | // not profitable to scalarize any instructions, the presence of VF in the |
6467 | // map will indicate that we've analyzed it already. |
6468 | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
6469 | |
6470 | // Find all the instructions that are scalar with predication in the loop and |
6471 | // determine if it would be better to not if-convert the blocks they are in. |
6472 | // If so, we also record the instructions to scalarize. |
6473 | for (BasicBlock *BB : TheLoop->blocks()) { |
6474 | if (!blockNeedsPredicationForAnyReason(BB)) |
6475 | continue; |
6476 | for (Instruction &I : *BB) |
6477 | if (isScalarWithPredication(&I, VF)) { |
6478 | ScalarCostsTy ScalarCosts; |
6479 | // Do not apply discount if scalable, because that would lead to |
6480 | // invalid scalarization costs. |
6481 | // Do not apply discount logic if hacked cost is needed |
6482 | // for emulated masked memrefs. |
6483 | if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && |
6484 | computePredInstDiscount(&I, ScalarCosts, VF) >= 0) |
6485 | ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); |
6486 | // Remember that BB will remain after vectorization. |
6487 | PredicatedBBsAfterVectorization.insert(BB); |
6488 | } |
6489 | } |
6490 | } |
6491 | |
6492 | int LoopVectorizationCostModel::computePredInstDiscount( |
6493 | Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { |
6494 | assert(!isUniformAfterVectorization(PredInst, VF) &&
6495 | "Instruction marked uniform-after-vectorization will be predicated");
6496 | |
6497 | // Initialize the discount to zero, meaning that the scalar version and the |
6498 | // vector version cost the same. |
6499 | InstructionCost Discount = 0; |
6500 | |
6501 | // Holds instructions to analyze. The instructions we visit are mapped in |
6502 | // ScalarCosts. Those instructions are the ones that would be scalarized if |
6503 | // we find that the scalar version costs less. |
6504 | SmallVector<Instruction *, 8> Worklist; |
6505 | |
6506 | // Returns true if the given instruction can be scalarized. |
6507 | auto canBeScalarized = [&](Instruction *I) -> bool { |
6508 | // We only attempt to scalarize instructions forming a single-use chain |
6509 | // from the original predicated block that would otherwise be vectorized. |
6510 | // Although not strictly necessary, we give up on instructions we know will |
6511 | // already be scalar to avoid traversing chains that are unlikely to be |
6512 | // beneficial. |
6513 | if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || |
6514 | isScalarAfterVectorization(I, VF)) |
6515 | return false; |
6516 | |
6517 | // If the instruction is scalar with predication, it will be analyzed |
6518 | // separately. We ignore it within the context of PredInst. |
6519 | if (isScalarWithPredication(I, VF)) |
6520 | return false; |
6521 | |
6522 | // If any of the instruction's operands are uniform after vectorization, |
6523 | // the instruction cannot be scalarized. This prevents, for example, a |
6524 | // masked load from being scalarized. |
6525 | // |
6526 | // We assume we will only emit a value for lane zero of an instruction |
6527 | // marked uniform after vectorization, rather than VF identical values. |
6528 | // Thus, if we scalarize an instruction that uses a uniform, we would |
6529 | // create uses of values corresponding to the lanes we aren't emitting code |
6530 | // for. This behavior can be changed by allowing getScalarValue to clone |
6531 | // the lane zero values for uniforms rather than asserting. |
6532 | for (Use &U : I->operands()) |
6533 | if (auto *J = dyn_cast<Instruction>(U.get())) |
6534 | if (isUniformAfterVectorization(J, VF)) |
6535 | return false; |
6536 | |
6537 | // Otherwise, we can scalarize the instruction. |
6538 | return true; |
6539 | }; |
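     | // A minimal sketch of a qualifying chain (hypothetical IR, not from this
     | // file):
     | //   %a = add i32 %x, 1     ; single use, same block as PredInst
     | //   %d = udiv i32 %s, %a   ; PredInst, scalar with predication
     | // Here canBeScalarized(%a) returns true, provided %a is not itself
     | // scalar-with-predication and none of its operands are
     | // uniform-after-vectorization.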
6540 | |
6541 | // Compute the expected cost discount from scalarizing the entire expression |
6542 | // feeding the predicated instruction. We currently only consider expressions |
6543 | // that are single-use instruction chains. |
6544 | Worklist.push_back(PredInst); |
6545 | while (!Worklist.empty()) { |
6546 | Instruction *I = Worklist.pop_back_val(); |
6547 | |
6548 | // If we've already analyzed the instruction, there's nothing to do. |
6549 | if (ScalarCosts.find(I) != ScalarCosts.end()) |
6550 | continue; |
6551 | |
6552 | // Compute the cost of the vector instruction. Note that this cost already |
6553 | // includes the scalarization overhead of the predicated instruction. |
6554 | InstructionCost VectorCost = getInstructionCost(I, VF).first; |
6555 | |
6556 | // Compute the cost of the scalarized instruction. This cost is the cost of |
6557 | // the instruction as if it wasn't if-converted and instead remained in the |
6558 | // predicated block. We will scale this cost by block probability after |
6559 | // computing the scalarization overhead. |
6560 | InstructionCost ScalarCost = |
6561 | VF.getFixedValue() * |
6562 | getInstructionCost(I, ElementCount::getFixed(1)).first; |
6563 | |
6564 | // Compute the scalarization overhead of needed insertelement instructions |
6565 | // and phi nodes. |
6566 | if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { |
6567 | ScalarCost += TTI.getScalarizationOverhead( |
6568 | cast<VectorType>(ToVectorTy(I->getType(), VF)), |
6569 | APInt::getAllOnes(VF.getFixedValue()), true, false); |
6570 | ScalarCost += |
6571 | VF.getFixedValue() * |
6572 | TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); |
6573 | } |
6574 | |
6575 | // Compute the scalarization overhead of needed extractelement |
6576 | // instructions. For each of the instruction's operands, if the operand can |
6577 | // be scalarized, add it to the worklist; otherwise, account for the |
6578 | // overhead. |
6579 | for (Use &U : I->operands()) |
6580 | if (auto *J = dyn_cast<Instruction>(U.get())) { |
6581 | assert(VectorType::isValidElementType(J->getType()) &&
6582 | "Instruction has non-scalar type");
6583 | if (canBeScalarized(J)) |
6584 | Worklist.push_back(J); |
6585 | else if (needsExtract(J, VF)) { |
6586 | ScalarCost += TTI.getScalarizationOverhead( |
6587 | cast<VectorType>(ToVectorTy(J->getType(), VF)), |
6588 | APInt::getAllOnes(VF.getFixedValue()), false, true); |
6589 | } |
6590 | } |
6591 | |
6592 | // Scale the total scalar cost by block probability. |
6593 | ScalarCost /= getReciprocalPredBlockProb(); |
6594 | |
6595 | // Compute the discount. A non-negative discount means the vector version |
6596 | // of the instruction costs more, and scalarizing would be beneficial. |
6597 | Discount += VectorCost - ScalarCost; |
6598 | ScalarCosts[I] = ScalarCost; |
6599 | } |
6600 | |
6601 | return *Discount.getValue(); |
6602 | } |
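     | // Worked example with made-up costs: suppose VectorCost = 10 and the cost
     | // of one scalar lane is 3 at VF = 4, so ScalarCost = 12 before scaling.
     | // Assuming the default reciprocal block probability of 2 (the predicated
     | // block runs on half the iterations), ScalarCost becomes 6 and the discount
     | // grows by 10 - 6 = 4, i.e. scalarization is expected to be profitable.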
6603 | |
6604 | LoopVectorizationCostModel::VectorizationCostTy |
6605 | LoopVectorizationCostModel::expectedCost( |
6606 | ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { |
6607 | VectorizationCostTy Cost; |
6608 | |
6609 | // For each block. |
6610 | for (BasicBlock *BB : TheLoop->blocks()) { |
6611 | VectorizationCostTy BlockCost; |
6612 | |
6613 | // For each instruction in the old loop. |
6614 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
6615 | // Skip ignored values. |
6616 | if (ValuesToIgnore.count(&I) || |
6617 | (VF.isVector() && VecValuesToIgnore.count(&I))) |
6618 | continue; |
6619 | |
6620 | VectorizationCostTy C = getInstructionCost(&I, VF); |
6621 | |
6622 | // Check if we should override the cost. |
6623 | if (C.first.isValid() && |
6624 | ForceTargetInstructionCost.getNumOccurrences() > 0) |
6625 | C.first = InstructionCost(ForceTargetInstructionCost); |
6626 | |
6627 | // Keep a list of instructions with invalid costs. |
6628 | if (Invalid && !C.first.isValid()) |
6629 | Invalid->emplace_back(&I, VF); |
6630 | |
6631 | BlockCost.first += C.first; |
6632 | BlockCost.second |= C.second; |
6633 | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6634 | << " for VF " << VF << " For instruction: " << I
6635 | << '\n');
6636 | } |
6637 | |
6638 | // If we are vectorizing a predicated block, it will have been |
6639 | // if-converted. This means that the block's instructions (aside from |
6640 | // stores and instructions that may divide by zero) will now be |
6641 | // unconditionally executed. For the scalar case, we may not always execute |
6642 | // the predicated block, if it is an if-else block. Thus, scale the block's |
6643 | // cost by the probability of executing it. blockNeedsPredication from |
6644 | // Legal is used so as to not include all blocks in tail folded loops. |
6645 | if (VF.isScalar() && Legal->blockNeedsPredication(BB)) |
6646 | BlockCost.first /= getReciprocalPredBlockProb(); |
6647 | |
6648 | Cost.first += BlockCost.first; |
6649 | Cost.second |= BlockCost.second; |
6650 | } |
6651 | |
6652 | return Cost; |
6653 | } |
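     | // For cost-model experiments, the per-instruction cost above can be pinned
     | // with the -force-target-instruction-cost flag (e.g. passing
     | // "-mllvm -force-target-instruction-cost=1" to clang), which is what the
     | // ForceTargetInstructionCost override in the loop implements.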
6654 | |
6655 | /// Gets Address Access SCEV after verifying that the access pattern |
6656 | /// is loop invariant except for the induction variable dependence.
6657 | /// |
6658 | /// This SCEV can be sent to the Target in order to estimate the address |
6659 | /// calculation cost. |
6660 | static const SCEV *getAddressAccessSCEV( |
6661 | Value *Ptr, |
6662 | LoopVectorizationLegality *Legal, |
6663 | PredicatedScalarEvolution &PSE, |
6664 | const Loop *TheLoop) { |
6665 | |
6666 | auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
6667 | if (!Gep) |
6668 | return nullptr; |
6669 | |
6670 | // We are looking for a gep with all loop invariant indices except for one |
6671 | // which should be an induction variable. |
6672 | auto SE = PSE.getSE(); |
6673 | unsigned NumOperands = Gep->getNumOperands(); |
6674 | for (unsigned i = 1; i < NumOperands; ++i) { |
6675 | Value *Opd = Gep->getOperand(i); |
6676 | if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && |
6677 | !Legal->isInductionVariable(Opd)) |
6678 | return nullptr; |
6679 | } |
6680 | |
6681 | // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6682 | return PSE.getSCEV(Ptr); |
6683 | } |
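     | // Example of an accepted pattern (hypothetical IR): in
     | //   %p = getelementptr [256 x i32], [256 x i32]* %base, i64 %inv, i64 %iv
     | // %inv is loop invariant and %iv is an induction variable, so the SCEV of
     | // %p is returned; an index that is neither invariant nor an induction
     | // makes this helper return nullptr instead.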
6684 | |
6685 | static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { |
6686 | return Legal->hasStride(I->getOperand(0)) || |
6687 | Legal->hasStride(I->getOperand(1)); |
6688 | } |
6689 | |
6690 | InstructionCost |
6691 | LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, |
6692 | ElementCount VF) { |
6693 | assert(VF.isVector() &&
6694 | "Scalarization cost of instruction implies vectorization.");
6695 | if (VF.isScalable()) |
6696 | return InstructionCost::getInvalid(); |
6697 | |
6698 | Type *ValTy = getLoadStoreType(I); |
6699 | auto SE = PSE.getSE(); |
6700 | |
6701 | unsigned AS = getLoadStoreAddressSpace(I); |
6702 | Value *Ptr = getLoadStorePointerOperand(I); |
6703 | Type *PtrTy = ToVectorTy(Ptr->getType(), VF); |
6704 | // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` |
6705 | // that it is being called from this specific place. |
6706 | |
6707 | // Figure out whether the access is strided and get the stride value |
6708 | // if it's known at compile time.
6709 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); |
6710 | |
6711 | // Get the cost of the scalar memory instruction and address computation. |
6712 | InstructionCost Cost = |
6713 | VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); |
6714 | |
6715 | // Don't pass *I here, since it is scalar but will actually be part of a |
6716 | // vectorized loop where the user of it is a vectorized instruction. |
6717 | const Align Alignment = getLoadStoreAlignment(I); |
6718 | Cost += VF.getKnownMinValue() * |
6719 | TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, |
6720 | AS, TTI::TCK_RecipThroughput); |
6721 | |
6722 | // Get the overhead of the extractelement and insertelement instructions |
6723 | // we might create due to scalarization. |
6724 | Cost += getScalarizationOverhead(I, VF); |
6725 | |
6726 | // If we have a predicated load/store, it will need extra i1 extracts and |
6727 | // conditional branches, but may not be executed for each vector lane. Scale |
6728 | // the cost by the probability of executing the predicated block. |
6729 | if (isPredicatedInst(I, VF)) { |
6730 | Cost /= getReciprocalPredBlockProb(); |
6731 | |
6732 | // Add the cost of an i1 extract and a branch |
6733 | auto *Vec_i1Ty = |
6734 | VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); |
6735 | Cost += TTI.getScalarizationOverhead( |
6736 | Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), |
6737 | /*Insert=*/false, /*Extract=*/true); |
6738 | Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); |
6739 | |
6740 | if (useEmulatedMaskMemRefHack(I, VF)) |
6741 | // Artificially setting to a high enough value to practically disable |
6742 | // vectorization with such operations. |
6743 | Cost = 3000000; |
6744 | } |
6745 | |
6746 | return Cost; |
6747 | } |
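     | // Illustrative arithmetic (made-up target costs): at VF = 4, with an
     | // address computation cost of 1 and a scalar load cost of 2, the base is
     | // 4*1 + 4*2 = 12 plus insert/extract overhead. For a predicated access
     | // that sum is divided by the block probability factor, and the i1 extract
     | // overhead plus a branch cost are then added on top.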
6748 | |
6749 | InstructionCost |
6750 | LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
6751 | ElementCount VF) { |
6752 | Type *ValTy = getLoadStoreType(I); |
6753 | auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); |
6754 | Value *Ptr = getLoadStorePointerOperand(I); |
6755 | unsigned AS = getLoadStoreAddressSpace(I); |
6756 | int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); |
6757 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6758 | |
6759 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6760 | "Stride should be 1 or -1 for consecutive memory access");
6761 | const Align Alignment = getLoadStoreAlignment(I); |
6762 | InstructionCost Cost = 0; |
6763 | if (Legal->isMaskRequired(I)) |
6764 | Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, |
6765 | CostKind); |
6766 | else |
6767 | Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, |
6768 | CostKind, I); |
6769 | |
6770 | bool Reverse = ConsecutiveStride < 0; |
6771 | if (Reverse) |
6772 | Cost += |
6773 | TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); |
6774 | return Cost; |
6775 | } |
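     | // For instance (illustrative), a loop reading a[N - i] accesses memory
     | // with ConsecutiveStride == -1: the wide load costs the same as in the
     | // forward case, and the SK_Reverse shuffle above models the extra work of
     | // restoring the original element order.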
6776 | |
6777 | InstructionCost |
6778 | LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
6779 | ElementCount VF) { |
6780 | assert(Legal->isUniformMemOp(*I));
6781 | |
6782 | Type *ValTy = getLoadStoreType(I); |
6783 | auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); |
6784 | const Align Alignment = getLoadStoreAlignment(I); |
6785 | unsigned AS = getLoadStoreAddressSpace(I); |
6786 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6787 | if (isa<LoadInst>(I)) { |
6788 | return TTI.getAddressComputationCost(ValTy) + |
6789 | TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, |
6790 | CostKind) + |
6791 | TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); |
6792 | } |
6793 | StoreInst *SI = cast<StoreInst>(I); |
6794 | |
6795 | bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); |
6796 | return TTI.getAddressComputationCost(ValTy) + |
6797 | TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, |
6798 | CostKind) + |
6799 | (isLoopInvariantStoreValue |
6800 | ? 0 |
6801 | : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, |
6802 | VF.getKnownMinValue() - 1)); |
6803 | } |
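     | // Two illustrative shapes (hypothetical C): for "sum += *p" with
     | // loop-invariant p, the load is costed as a scalar load plus an
     | // SK_Broadcast; for "*p = x[i]" the stored value varies per iteration, so
     | // an extract of the last lane (VF - 1) is charged, modelling that only the
     | // final value needs to be stored.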
6804 | |
6805 | InstructionCost |
6806 | LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
6807 | ElementCount VF) { |
6808 | Type *ValTy = getLoadStoreType(I); |
6809 | auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); |
6810 | const Align Alignment = getLoadStoreAlignment(I); |
6811 | const Value *Ptr = getLoadStorePointerOperand(I); |
6812 | |
6813 | return TTI.getAddressComputationCost(VectorTy) + |
6814 | TTI.getGatherScatterOpCost( |
6815 | I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, |
6816 | TargetTransformInfo::TCK_RecipThroughput, I); |
6817 | } |
6818 | |
6819 | InstructionCost |
6820 | LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, |
6821 | ElementCount VF) { |
6822 | // TODO: Once we have support for interleaving with scalable vectors |
6823 | // we can calculate the cost properly here. |
6824 | if (VF.isScalable()) |
6825 | return InstructionCost::getInvalid(); |
6826 | |
6827 | Type *ValTy = getLoadStoreType(I); |
6828 | auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); |
6829 | unsigned AS = getLoadStoreAddressSpace(I); |
6830 | |
6831 | auto Group = getInterleavedAccessGroup(I); |
6832 | assert(Group && "Fail to get an interleaved access group.");
6833 | |
6834 | unsigned InterleaveFactor = Group->getFactor(); |
6835 | auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); |
6836 | |
6837 | // Holds the indices of existing members in the interleaved group. |
6838 | SmallVector<unsigned, 4> Indices; |
6839 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
6840 | if (Group->getMember(IF)) |
6841 | Indices.push_back(IF); |
6842 | |
6843 | // Calculate the cost of the whole interleaved group. |
6844 | bool UseMaskForGaps = |
6845 | (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || |
6846 | (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); |
6847 | InstructionCost Cost = TTI.getInterleavedMemoryOpCost( |
6848 | I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), |
6849 | AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); |
6850 | |
6851 | if (Group->isReverse()) { |
6852 | // TODO: Add support for reversed masked interleaved access. |
6853 | assert(!Legal->isMaskRequired(I) &&
6854 | "Reverse masked interleaved access not supported.");
6855 | Cost += |
6856 | Group->getNumMembers() * |
6857 | TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); |
6858 | } |
6859 | return Cost; |
6860 | } |
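     | // Illustrative: for a factor-2 group covering a[2*i] and a[2*i+1] at
     | // VF = 4, WideVecTy has 8 elements and Indices = {0, 1}. A store group
     | // with a missing member would also set UseMaskForGaps, and a reversed
     | // group pays one SK_Reverse shuffle per member, as above.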
6861 | |
6862 | Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( |
6863 | Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { |
6864 | using namespace llvm::PatternMatch; |
6865 | // Early exit if there are no in-loop reductions.
6866 | if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) |
6867 | return None; |
6868 | auto *VectorTy = cast<VectorType>(Ty); |
6869 | |
6870 | // We are looking for one of the following patterns, and finding the minimal acceptable cost:
6871 | // reduce(mul(ext(A), ext(B))) or |
6872 | // reduce(mul(A, B)) or |
6873 | // reduce(ext(A)) or |
6874 | // reduce(A). |
6875 | // The basic idea is that we walk down the tree to do that, finding the root |
6876 | // reduction instruction in InLoopReductionImmediateChains. From there we find |
6877 | // the pattern of mul/ext and test the cost of the entire pattern vs the cost |
6878 | // of the components. If the reduction cost is lower, we return it for the
6879 | // reduction instruction and 0 for the other instructions in the pattern. If
6880 | // it is not, we return an invalid cost specifying that the original cost
6881 | // method should be used.
6882 | Instruction *RetI = I; |
6883 | if (match(RetI, m_ZExtOrSExt(m_Value()))) { |
6884 | if (!RetI->hasOneUser()) |
6885 | return None; |
6886 | RetI = RetI->user_back(); |
6887 | } |
6888 | if (match(RetI, m_Mul(m_Value(), m_Value())) && |
6889 | RetI->user_back()->getOpcode() == Instruction::Add) { |
6890 | if (!RetI->hasOneUser()) |
6891 | return None; |
6892 | RetI = RetI->user_back(); |
6893 | } |
6894 | |
6895 | // Test if the found instruction is a reduction; if not, return None so
6896 | // that the parent falls back to the original cost modelling.
6897 | if (!InLoopReductionImmediateChains.count(RetI)) |
6898 | return None; |
6899 | |
6900 | // Find the reduction this chain is a part of and calculate the basic cost of |
6901 | // the reduction on its own. |
6902 | Instruction *LastChain = InLoopReductionImmediateChains[RetI]; |
6903 | Instruction *ReductionPhi = LastChain; |
6904 | while (!isa<PHINode>(ReductionPhi)) |
6905 | ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; |
6906 | |
6907 | const RecurrenceDescriptor &RdxDesc = |
6908 | Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; |
6909 | |
6910 | InstructionCost BaseCost = TTI.getArithmeticReductionCost( |
6911 | RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); |
6912 | |
6913 | // For a call to the llvm.fmuladd intrinsic we need to add the cost of a |
6914 | // normal fmul instruction to the cost of the fadd reduction. |
6915 | if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) |
6916 | BaseCost += |
6917 | TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); |
6918 | |
6919 | // If we're using ordered reductions then we can just return the base cost |
6920 | // here, since getArithmeticReductionCost calculates the full ordered |
6921 | // reduction cost when FP reassociation is not allowed. |
6922 | if (useOrderedReductions(RdxDesc)) |
6923 | return BaseCost; |
6924 | |
6925 | // Get the operand that was not the reduction chain and match it to one of the |
6926 | // patterns, returning the better cost if it is found. |
6927 | Instruction *RedOp = RetI->getOperand(1) == LastChain |
6928 | ? dyn_cast<Instruction>(RetI->getOperand(0)) |
6929 | : dyn_cast<Instruction>(RetI->getOperand(1)); |
6930 | |
6931 | VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); |
6932 | |
6933 | Instruction *Op0, *Op1; |
6934 | if (RedOp && |
6935 | match(RedOp, |
6936 | m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && |
6937 | match(Op0, m_ZExtOrSExt(m_Value())) && |
6938 | Op0->getOpcode() == Op1->getOpcode() && |
6939 | Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && |
6940 | !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && |
6941 | (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { |
6942 | |
6943 | // Matched reduce(ext(mul(ext(A), ext(B))))
6944 | // Note that the extend opcodes need to all match, or if A==B they will have |
6945 | // been converted to zext(mul(sext(A), sext(A))) as it is known positive, |
6946 | // which is equally fine. |
6947 | bool IsUnsigned = isa<ZExtInst>(Op0); |
6948 | auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); |
6949 | auto *MulType = VectorType::get(Op0->getType(), VectorTy); |
6950 | |
6951 | InstructionCost ExtCost = |
6952 | TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, |
6953 | TTI::CastContextHint::None, CostKind, Op0); |
6954 | InstructionCost MulCost = |
6955 | TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); |
6956 | InstructionCost Ext2Cost = |
6957 | TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, |
6958 | TTI::CastContextHint::None, CostKind, RedOp); |
6959 | |
6960 | InstructionCost RedCost = TTI.getExtendedAddReductionCost( |
6961 | /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, |
6962 | CostKind); |
6963 | |
6964 | if (RedCost.isValid() && |
6965 | RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) |
6966 | return I == RetI ? RedCost : 0; |
6967 | } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && |
6968 | !TheLoop->isLoopInvariant(RedOp)) { |
6969 | // Matched reduce(ext(A)) |
6970 | bool IsUnsigned = isa<ZExtInst>(RedOp); |
6971 | auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); |
6972 | InstructionCost RedCost = TTI.getExtendedAddReductionCost( |
6973 | /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, |
6974 | CostKind); |
6975 | |
6976 | InstructionCost ExtCost = |
6977 | TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, |
6978 | TTI::CastContextHint::None, CostKind, RedOp); |
6979 | if (RedCost.isValid() && RedCost < BaseCost + ExtCost) |
6980 | return I == RetI ? RedCost : 0; |
6981 | } else if (RedOp && |
6982 | match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { |
6983 | if (match(Op0, m_ZExtOrSExt(m_Value())) && |
6984 | Op0->getOpcode() == Op1->getOpcode() && |
6985 | !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { |
6986 | bool IsUnsigned = isa<ZExtInst>(Op0); |
6987 | Type *Op0Ty = Op0->getOperand(0)->getType(); |
6988 | Type *Op1Ty = Op1->getOperand(0)->getType(); |
6989 | Type *LargestOpTy = |
6990 | Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty |
6991 | : Op0Ty; |
6992 | auto *ExtType = VectorType::get(LargestOpTy, VectorTy); |
6993 | |
6994 | // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of |
6995 | // different sizes. We take the largest type as the ext to reduce, and add |
6996 | // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). |
6997 | InstructionCost ExtCost0 = TTI.getCastInstrCost( |
6998 | Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), |
6999 | TTI::CastContextHint::None, CostKind, Op0); |
7000 | InstructionCost ExtCost1 = TTI.getCastInstrCost( |
7001 | Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), |
7002 | TTI::CastContextHint::None, CostKind, Op1); |
7003 | InstructionCost MulCost = |
7004 | TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); |
7005 | |
7006 | InstructionCost RedCost = TTI.getExtendedAddReductionCost( |
7007 | /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, |
7008 | CostKind); |
7009 | InstructionCost ExtraExtCost = 0; |
7010 | if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { |
7011 | Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; |
7012 | ExtraExtCost = TTI.getCastInstrCost( |
7013 | ExtraExtOp->getOpcode(), ExtType, |
7014 | VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), |
7015 | TTI::CastContextHint::None, CostKind, ExtraExtOp); |
7016 | } |
7017 | |
7018 | if (RedCost.isValid() && |
7019 | (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) |
7020 | return I == RetI ? RedCost : 0; |
7021 | } else if (!match(I, m_ZExtOrSExt(m_Value()))) { |
7022 | // Matched reduce(mul()) |
7023 | InstructionCost MulCost = |
7024 | TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); |
7025 | |
7026 | InstructionCost RedCost = TTI.getExtendedAddReductionCost( |
7027 | /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, |
7028 | CostKind); |
7029 | |
7030 | if (RedCost.isValid() && RedCost < MulCost + BaseCost) |
7031 | return I == RetI ? RedCost : 0; |
7032 | } |
7033 | } |
7034 | |
7035 | return I == RetI ? Optional<InstructionCost>(BaseCost) : None; |
7036 | } |
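     | // A typical match (hypothetical IR), e.g. the inner step of a dot product:
     | //   %ea = sext <8 x i8> %a to <8 x i32>
     | //   %eb = sext <8 x i8> %b to <8 x i32>
     | //   %m  = mul <8 x i32> %ea, %eb   ; feeds an in-loop add reduction
     | // On a target with an extended-MLA reduction the whole chain is costed
     | // once as RedCost, and the mul/ext members are reported as cost 0.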
7037 | |
7038 | InstructionCost |
7039 | LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
7040 | ElementCount VF) { |
7041 | // Calculate the scalar cost only. The vectorization cost should already
7042 | // be available at this point.
7043 | if (VF.isScalar()) { |
7044 | Type *ValTy = getLoadStoreType(I); |
7045 | const Align Alignment = getLoadStoreAlignment(I); |
7046 | unsigned AS = getLoadStoreAddressSpace(I); |
7047 | |
7048 | return TTI.getAddressComputationCost(ValTy) + |
7049 | TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, |
7050 | TTI::TCK_RecipThroughput, I); |
7051 | } |
7052 | return getWideningCost(I, VF); |
7053 | } |
7054 | |
7055 | LoopVectorizationCostModel::VectorizationCostTy |
7056 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, |
7057 | ElementCount VF) { |
7058 | // If we know that this instruction will remain uniform, check the cost of |
7059 | // the scalar version. |
7060 | if (isUniformAfterVectorization(I, VF)) |
7061 | VF = ElementCount::getFixed(1); |
7062 | |
7063 | if (VF.isVector() && isProfitableToScalarize(I, VF)) |
7064 | return VectorizationCostTy(InstsToScalarize[VF][I], false); |
7065 | |
7066 | // Forced scalars do not have any scalarization overhead. |
7067 | auto ForcedScalar = ForcedScalars.find(VF); |
7068 | if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { |
7069 | auto InstSet = ForcedScalar->second; |
7070 | if (InstSet.count(I)) |
7071 | return VectorizationCostTy( |
7072 | (getInstructionCost(I, ElementCount::getFixed(1)).first * |
7073 | VF.getKnownMinValue()), |
7074 | false); |
7075 | } |
7076 | |
7077 | Type *VectorTy; |
7078 | InstructionCost C = getInstructionCost(I, VF, VectorTy); |
7079 | |
7080 | bool TypeNotScalarized = false; |
7081 | if (VF.isVector() && VectorTy->isVectorTy()) { |
7082 | unsigned NumParts = TTI.getNumberOfParts(VectorTy); |
7083 | if (NumParts) |
7084 | TypeNotScalarized = NumParts < VF.getKnownMinValue(); |
7085 | else |
7086 | C = InstructionCost::getInvalid(); |
7087 | } |
7088 | return VectorizationCostTy(C, TypeNotScalarized); |
7089 | } |
7090 | |
7091 | InstructionCost |
7092 | LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, |
7093 | ElementCount VF) const { |
7094 | |
7095 | // There is no mechanism yet to create a scalable scalarization loop, |
7096 | // so this is currently Invalid. |
7097 | if (VF.isScalable()) |
7098 | return InstructionCost::getInvalid(); |
7099 | |
7100 | if (VF.isScalar()) |
7101 | return 0; |
7102 | |
7103 | InstructionCost Cost = 0; |
7104 | Type *RetTy = ToVectorTy(I->getType(), VF); |
7105 | if (!RetTy->isVoidTy() && |
7106 | (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) |
7107 | Cost += TTI.getScalarizationOverhead( |
7108 | cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, |
7109 | false); |
7110 | |
7111 | // Some targets keep addresses scalar. |
7112 | if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) |
7113 | return Cost; |
7114 | |
7115 | // Some targets support efficient element stores. |
7116 | if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) |
7117 | return Cost; |
7118 | |
7119 | // Collect operands to consider. |
7120 | CallInst *CI = dyn_cast<CallInst>(I); |
7121 | Instruction::op_range Ops = CI ? CI->args() : I->operands(); |
7122 | |
7123 | // Skip operands that do not require extraction/scalarization and do not incur |
7124 | // any overhead. |
7125 | SmallVector<Type *> Tys; |
7126 | for (auto *V : filterExtractingOperands(Ops, VF)) |
7127 | Tys.push_back(MaybeVectorizeType(V->getType(), VF)); |
7128 | return Cost + TTI.getOperandsScalarizationOverhead( |
7129 | filterExtractingOperands(Ops, VF), Tys); |
7130 | } |
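     | // Sketch of what is summed (assuming a non-void, non-call instruction at
     | // VF = 4): insertion overhead for the 4 scalar results, plus extraction
     | // overhead for each operand that lives in a vector register, via
     | // getOperandsScalarizationOverhead over the filtered operand list.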
7131 | |
7132 | void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { |
7133 | if (VF.isScalar()) |
7134 | return; |
7135 | NumPredStores = 0; |
7136 | for (BasicBlock *BB : TheLoop->blocks()) { |
7137 | // For each instruction in the old loop. |
7138 | for (Instruction &I : *BB) { |
7139 | Value *Ptr = getLoadStorePointerOperand(&I); |
7140 | if (!Ptr) |
7141 | continue; |
7142 | |
7143 | // TODO: We should generate better code and update the cost model for |
7144 | // predicated uniform stores. Today they are treated as any other |
7145 | // predicated store (see added test cases in |
7146 | // invariant-store-vectorization.ll). |
7147 | if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) |
7148 | NumPredStores++; |
7149 | |
7150 | if (Legal->isUniformMemOp(I)) { |
7151 | // TODO: Avoid replicating loads and stores instead of |
7152 | // relying on instcombine to remove them. |
7153 | // Load: Scalar load + broadcast |
7154 | // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract |
7155 | InstructionCost Cost; |
7156 | if (isa<StoreInst>(&I) && VF.isScalable() && |
7157 | isLegalGatherOrScatter(&I, VF)) { |
7158 | Cost = getGatherScatterCost(&I, VF); |
7159 | setWideningDecision(&I, VF, CM_GatherScatter, Cost); |
7160 | } else { |
7161 | assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7162 | "Cannot yet scalarize uniform stores");
7163 | Cost = getUniformMemOpCost(&I, VF); |
7164 | setWideningDecision(&I, VF, CM_Scalarize, Cost); |
7165 | } |
7166 | continue; |
7167 | } |
7168 | |
7169 | // We assume that widening is the best solution when possible. |
7170 | if (memoryInstructionCanBeWidened(&I, VF)) { |
7171 | InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); |
7172 | int ConsecutiveStride = Legal->isConsecutivePtr( |
7173 | getLoadStoreType(&I), getLoadStorePointerOperand(&I)); |
7174 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7175 | "Expected consecutive stride.");
7176 | InstWidening Decision = |
7177 | ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; |
7178 | setWideningDecision(&I, VF, Decision, Cost); |
7179 | continue; |
7180 | } |
7181 | |
7182 | // Choose between Interleaving, Gather/Scatter or Scalarization. |
7183 | InstructionCost InterleaveCost = InstructionCost::getInvalid(); |
7184 | unsigned NumAccesses = 1; |
7185 | if (isAccessInterleaved(&I)) { |
7186 | auto Group = getInterleavedAccessGroup(&I); |
7187 | assert(Group && "Fail to get an interleaved access group.");
7188 | |
7189 | // Make one decision for the whole group. |
7190 | if (getWideningDecision(&I, VF) != CM_Unknown) |
7191 | continue; |
7192 | |
7193 | NumAccesses = Group->getNumMembers(); |
7194 | if (interleavedAccessCanBeWidened(&I, VF)) |
7195 | InterleaveCost = getInterleaveGroupCost(&I, VF); |
7196 | } |
7197 | |
7198 | InstructionCost GatherScatterCost = |
7199 | isLegalGatherOrScatter(&I, VF) |
7200 | ? getGatherScatterCost(&I, VF) * NumAccesses |
7201 | : InstructionCost::getInvalid(); |
7202 | |
7203 | InstructionCost ScalarizationCost = |
7204 | getMemInstScalarizationCost(&I, VF) * NumAccesses; |
7205 | |
7206 | // Choose the best solution for the current VF, record the decision, and
7207 | // use it during vectorization.
7208 | InstructionCost Cost; |
7209 | InstWidening Decision; |
7210 | if (InterleaveCost <= GatherScatterCost && |
7211 | InterleaveCost < ScalarizationCost) { |
7212 | Decision = CM_Interleave; |
7213 | Cost = InterleaveCost; |
7214 | } else if (GatherScatterCost < ScalarizationCost) { |
7215 | Decision = CM_GatherScatter; |
7216 | Cost = GatherScatterCost; |
7217 | } else { |
7218 | Decision = CM_Scalarize; |
7219 | Cost = ScalarizationCost; |
7220 | } |
7221 | // If the instruction belongs to an interleave group, the whole group
7222 | // receives the same decision. The group is charged the cost once, and
7223 | // that cost is attributed to a single member instruction.
7224 | if (auto Group = getInterleavedAccessGroup(&I)) |
7225 | setWideningDecision(Group, VF, Decision, Cost); |
7226 | else |
7227 | setWideningDecision(&I, VF, Decision, Cost); |
7228 | } |
7229 | } |
7230 | |
7231 | // Make sure that any load of address and any other address computation |
7232 | // remains scalar unless there is gather/scatter support. This avoids |
7233 | // inevitable extracts into address registers, and also has the benefit of |
7234 | // activating LSR more, since that pass can't optimize vectorized |
7235 | // addresses. |
7236 | if (TTI.prefersVectorizedAddressing()) |
7237 | return; |
7238 | |
7239 | // Start with all scalar pointer uses. |
7240 | SmallPtrSet<Instruction *, 8> AddrDefs; |
7241 | for (BasicBlock *BB : TheLoop->blocks()) |
7242 | for (Instruction &I : *BB) { |
7243 | Instruction *PtrDef = |
7244 | dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); |
7245 | if (PtrDef && TheLoop->contains(PtrDef) && |
7246 | getWideningDecision(&I, VF) != CM_GatherScatter) |
7247 | AddrDefs.insert(PtrDef); |
7248 | } |
7249 | |
7250 | // Add all instructions used to generate the addresses. |
7251 | SmallVector<Instruction *, 4> Worklist; |
7252 | append_range(Worklist, AddrDefs); |
7253 | while (!Worklist.empty()) { |
7254 | Instruction *I = Worklist.pop_back_val(); |
7255 | for (auto &Op : I->operands()) |
7256 | if (auto *InstOp = dyn_cast<Instruction>(Op)) |
7257 | if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && |
7258 | AddrDefs.insert(InstOp).second) |
7259 | Worklist.push_back(InstOp); |
7260 | } |
7261 | |
7262 | for (auto *I : AddrDefs) { |
7263 | if (isa<LoadInst>(I)) { |
7264 | // Setting the desired widening decision should ideally be handled by
7265 | // the cost functions, but since this involves the task of finding out
7266 | // if the loaded register is involved in an address computation, it is |
7267 | // instead changed here when we know this is the case. |
7268 | InstWidening Decision = getWideningDecision(I, VF); |
7269 | if (Decision == CM_Widen || Decision == CM_Widen_Reverse) |
7270 | // Scalarize a widened load of address. |
7271 | setWideningDecision( |
7272 | I, VF, CM_Scalarize, |
7273 | (VF.getKnownMinValue() * |
7274 | getMemoryInstructionCost(I, ElementCount::getFixed(1)))); |
7275 | else if (auto Group = getInterleavedAccessGroup(I)) { |
7276 | // Scalarize an interleave group of address loads. |
7277 | for (unsigned I = 0; I < Group->getFactor(); ++I) { |
7278 | if (Instruction *Member = Group->getMember(I)) |
7279 | setWideningDecision( |
7280 | Member, VF, CM_Scalarize, |
7281 | (VF.getKnownMinValue() * |
7282 | getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); |
7283 | } |
7284 | } |
7285 | } else |
7286 | // Make sure I gets scalarized and receives a cost estimate without
7287 | // scalarization overhead.
7288 | ForcedScalars[VF].insert(I); |
7289 | } |
7290 | } |
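     | // Net effect when vectorized addressing is not preferred (illustrative): a
     | // widened load feeding only address computation is re-decided as
     | // CM_Scalarize with a plain per-lane scalar cost, and every non-load
     | // instruction in the address chain lands in ForcedScalars, i.e. it is
     | // costed VF times with no insert/extract overhead.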
7291 | |
7292 | InstructionCost |
7293 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, |
7294 | Type *&VectorTy) { |
7295 | Type *RetTy = I->getType(); |
7296 | if (canTruncateToMinimalBitwidth(I, VF)) |
7297 | RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); |
7298 | auto SE = PSE.getSE(); |
7299 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
7300 | |
7301 | auto hasSingleCopyAfterVectorization = [this](Instruction *I, |
7302 | ElementCount VF) -> bool { |
7303 | if (VF.isScalar()) |
7304 | return true; |
7305 | |
7306 | auto Scalarized = InstsToScalarize.find(VF); |
7307 | assert(Scalarized != InstsToScalarize.end() &&
7308 | "VF not yet analyzed for scalarization profitability");
7309 | return !Scalarized->second.count(I) && |
7310 | llvm::all_of(I->users(), [&](User *U) { |
7311 | auto *UI = cast<Instruction>(U); |
7312 | return !Scalarized->second.count(UI); |
7313 | }); |
7314 | }; |
7315 | (void) hasSingleCopyAfterVectorization; |
7316 | |
7317 | if (isScalarAfterVectorization(I, VF)) { |
7318 | // With the exception of GEPs and PHIs, after scalarization there should |
7319 | // only be one copy of the instruction generated in the loop. This is |
7320 | // because the VF is either 1, or any instructions that need scalarizing |
7321 | // have already been dealt with by the time we get here. As a result,
7322 | // we don't have to multiply the instruction cost by VF.
7323 | assert(I->getOpcode() == Instruction::GetElementPtr ||
7324 | I->getOpcode() == Instruction::PHI ||
7325 | (I->getOpcode() == Instruction::BitCast &&
7326 | I->getType()->isPointerTy()) ||
7327 | hasSingleCopyAfterVectorization(I, VF));
7328 | VectorTy = RetTy; |
7329 | } else |
7330 | VectorTy = ToVectorTy(RetTy, VF); |
7331 | |
7332 | // TODO: We need to estimate the cost of intrinsic calls. |
7333 | switch (I->getOpcode()) { |
7334 | case Instruction::GetElementPtr: |
7335 | // We mark this instruction as zero-cost because the cost of GEPs in |
7336 | // vectorized code depends on whether the corresponding memory instruction |
7337 | // is scalarized or not. Therefore, we handle GEPs with the memory |
7338 | // instruction cost. |
7339 | return 0; |
7340 | case Instruction::Br: { |
7341 | // In cases of scalarized and predicated instructions, there will be VF |
7342 | // predicated blocks in the vectorized loop. Each branch around these
7343 | // blocks also requires an extract of its vector compare i1 element.
7344 | bool ScalarPredicatedBB = false; |
7345 | BranchInst *BI = cast<BranchInst>(I); |
7346 | if (VF.isVector() && BI->isConditional() && |
7347 | (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || |
7348 | PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) |
7349 | ScalarPredicatedBB = true; |
7350 | |
7351 | if (ScalarPredicatedBB) { |
7352 | // It is not possible to scalarize a scalable vector with predicated instructions.
7353 | if (VF.isScalable()) |
7354 | return InstructionCost::getInvalid(); |
7355 | // Return cost for branches around scalarized and predicated blocks. |
7356 | auto *Vec_i1Ty = |
7357 | VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); |
7358 | return ( |
7359 | TTI.getScalarizationOverhead( |
7360 | Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + |
7361 | (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); |
7362 | } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) |
7363 | // The back-edge branch will remain, as will all scalar branches. |
7364 | return TTI.getCFInstrCost(Instruction::Br, CostKind); |
7365 | else |
7366 | // This branch will be eliminated by if-conversion. |
7367 | return 0; |
7368 | // Note: We currently assume zero cost for an unconditional branch inside |
7369 | // a predicated block since it will become a fall-through, although we |
7370 | // may decide in the future to call TTI for all branches. |
7371 | } |
7372 | case Instruction::PHI: { |
7373 | auto *Phi = cast<PHINode>(I); |
7374 | |
7375 | // First-order recurrences are replaced by vector shuffles inside the loop. |
7376 | // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. |
7377 | if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) |
7378 | return TTI.getShuffleCost( |
7379 | TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), |
7380 | None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); |
7381 | |
7382 | // Phi nodes in non-header blocks (not inductions, reductions, etc.) are |
7383 | // converted into select instructions. We require N - 1 selects per phi |
7384 | // node, where N is the number of incoming values. |
7385 | if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) |
7386 | return (Phi->getNumIncomingValues() - 1) * |
7387 | TTI.getCmpSelInstrCost( |
7388 | Instruction::Select, ToVectorTy(Phi->getType(), VF), |
7389 | ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), |
7390 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
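// Editorial example: a phi merging three incoming values lowers to two
// vector selects, so its cost here is twice the select cost at this VF.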
7391 | |
7392 | return TTI.getCFInstrCost(Instruction::PHI, CostKind); |
7393 | } |
7394 | case Instruction::UDiv: |
7395 | case Instruction::SDiv: |
7396 | case Instruction::URem: |
7397 | case Instruction::SRem: |
7398 | // If we have a predicated instruction, it may not be executed for each |
7399 | // vector lane. Get the scalarization cost and scale this amount by the |
7400 | // probability of executing the predicated block. If the instruction is not |
7401 | // predicated, we fall through to the next case. |
7402 | if (VF.isVector() && isScalarWithPredication(I, VF)) { |
7403 | InstructionCost Cost = 0; |
7404 | |
7405 | // These instructions have a non-void type, so account for the phi nodes |
7406 | // that we will create. This cost is likely to be zero. The phi node |
7407 | // cost, if any, should be scaled by the block probability because it |
7408 | // models a copy at the end of each predicated block. |
7409 | Cost += VF.getKnownMinValue() * |
7410 | TTI.getCFInstrCost(Instruction::PHI, CostKind); |
7411 | |
7412 | // The cost of the non-predicated instruction. |
7413 | Cost += VF.getKnownMinValue() * |
7414 | TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); |
7415 | |
7416 | // The cost of insertelement and extractelement instructions needed for |
7417 | // scalarization. |
7418 | Cost += getScalarizationOverhead(I, VF); |
7419 | |
7420 | // Scale the cost by the probability of executing the predicated blocks. |
7421 | // This assumes the predicated block for each vector lane is equally |
7422 | // likely. |
7423 | return Cost / getReciprocalPredBlockProb(); |
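// Worked example (editorial sketch, assuming getReciprocalPredBlockProb()
// returns 2, i.e. each predicated block is assumed to execute half the
// time): at VF = 4, with a per-lane divide cost of 1, a phi cost of 0 and a
// scalarization overhead of 4, the returned cost is (0 + 4 + 4) / 2 = 4.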
7424 | } |
7425 | LLVM_FALLTHROUGH;
7426 | case Instruction::Add: |
7427 | case Instruction::FAdd: |
7428 | case Instruction::Sub: |
7429 | case Instruction::FSub: |
7430 | case Instruction::Mul: |
7431 | case Instruction::FMul: |
7432 | case Instruction::FDiv: |
7433 | case Instruction::FRem: |
7434 | case Instruction::Shl: |
7435 | case Instruction::LShr: |
7436 | case Instruction::AShr: |
7437 | case Instruction::And: |
7438 | case Instruction::Or: |
7439 | case Instruction::Xor: { |
7440 | // Since we will replace the stride by 1, the multiplication should go away.
7441 | if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) |
7442 | return 0; |
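// Editorial example: for an access like A[i * Stride] where loop versioning
// lets us assume Stride == 1, the multiply folds away and is costed as free.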
7443 | |
7444 | // Detect reduction patterns |
7445 | if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) |
7446 | return *RedCost; |
7447 | |
7448 | // Certain instructions can be cheaper to vectorize if they have a constant |
7449 | // second vector operand. One example of this is shifts on x86.
7450 | Value *Op2 = I->getOperand(1); |
7451 | TargetTransformInfo::OperandValueProperties Op2VP; |
7452 | TargetTransformInfo::OperandValueKind Op2VK = |
7453 | TTI.getOperandInfo(Op2, Op2VP); |
7454 | if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) |
7455 | Op2VK = TargetTransformInfo::OK_UniformValue; |
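// Editorial example: on x86, a shift whose amount is identical for every
// lane (uniform or constant) is typically cheaper than a shift by a
// per-lane variable amount; this operand-kind refinement lets the target
// report that cheaper cost.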
7456 | |
7457 | SmallVector<const Value *, 4> Operands(I->operand_values()); |
7458 | return TTI.getArithmeticInstrCost( |
7459 | I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, |
7460 | Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); |
7461 | } |
7462 | case Instruction::FNeg: { |
7463 | return TTI.getArithmeticInstrCost( |
7464 | I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, |
7465 | TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, |
7466 | TargetTransformInfo::OP_None, I->getOperand(0), I); |
7467 | } |
7468 | case Instruction::Select: { |
7469 | SelectInst *SI = cast<SelectInst>(I); |
7470 | const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); |
7471 | bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); |
7472 | |
7473 | const Value *Op0, *Op1; |
7474 | using namespace llvm::PatternMatch; |
7475 | if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || |
7476 | match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { |
7477 | // select x, y, false --> x & y |
7478 | // select x, true, y --> x | y |
7479 | TTI::OperandValueProperties Op1VP = TTI::OP_None; |
7480 | TTI::OperandValueProperties Op2VP = TTI::OP_None; |
7481 | TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); |
7482 | TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); |
7483 | assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7484 | Op1->getType()->getScalarSizeInBits() == 1);
7485 | |
7486 | SmallVector<const Value *, 2> Operands{Op0, Op1}; |
7487 | return TTI.getArithmeticInstrCost( |
7488 | match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, |
7489 | CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); |
7490 | } |
7491 | |
7492 | Type *CondTy = SI->getCondition()->getType(); |
7493 | if (!ScalarCond) |
7494 | CondTy = VectorType::get(CondTy, VF); |
7495 | |
7496 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
7497 | if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) |
7498 | Pred = Cmp->getPredicate(); |
7499 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, |
7500 | CostKind, I); |
7501 | } |
7502 | case Instruction::ICmp: |
7503 | case Instruction::FCmp: { |
7504 | Type *ValTy = I->getOperand(0)->getType(); |
7505 | Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); |
7506 | if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) |
7507 | ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); |
7508 | VectorTy = ToVectorTy(ValTy, VF); |
7509 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, |
7510 | cast<CmpInst>(I)->getPredicate(), CostKind, |
7511 | I); |
7512 | } |
7513 | case Instruction::Store: |
7514 | case Instruction::Load: { |
7515 | ElementCount Width = VF; |
7516 | if (Width.isVector()) { |
7517 | InstWidening Decision = getWideningDecision(I, Width); |
7518 | assert(Decision != CM_Unknown &&
7519 | "CM decision should be taken at this point");
7520 | if (Decision == CM_Scalarize) |
7521 | Width = ElementCount::getFixed(1); |
7522 | } |
7523 | VectorTy = ToVectorTy(getLoadStoreType(I), Width); |
7524 | return getMemoryInstructionCost(I, VF); |
7525 | } |
7526 | case Instruction::BitCast: |
7527 | if (I->getType()->isPointerTy()) |
7528 | return 0; |
7529 | LLVM_FALLTHROUGH;
7530 | case Instruction::ZExt: |
7531 | case Instruction::SExt: |
7532 | case Instruction::FPToUI: |
7533 | case Instruction::FPToSI: |
7534 | case Instruction::FPExt: |
7535 | case Instruction::PtrToInt: |
7536 | case Instruction::IntToPtr: |
7537 | case Instruction::SIToFP: |
7538 | case Instruction::UIToFP: |
7539 | case Instruction::Trunc: |
7540 | case Instruction::FPTrunc: { |
7541 | // Computes the CastContextHint from a Load/Store instruction. |
7542 | auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { |
7543 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7544 | "Expected a load or a store!");
7545 | |
7546 | if (VF.isScalar() || !TheLoop->contains(I)) |
7547 | return TTI::CastContextHint::Normal; |
7548 | |
7549 | switch (getWideningDecision(I, VF)) { |
7550 | case LoopVectorizationCostModel::CM_GatherScatter: |
7551 | return TTI::CastContextHint::GatherScatter; |
7552 | case LoopVectorizationCostModel::CM_Interleave: |
7553 | return TTI::CastContextHint::Interleave; |
7554 | case LoopVectorizationCostModel::CM_Scalarize: |
7555 | case LoopVectorizationCostModel::CM_Widen: |
7556 | return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked |
7557 | : TTI::CastContextHint::Normal; |
7558 | case LoopVectorizationCostModel::CM_Widen_Reverse: |
7559 | return TTI::CastContextHint::Reversed; |
7560 | case LoopVectorizationCostModel::CM_Unknown: |
7561 | llvm_unreachable("Instr did not go through cost modelling?");
7562 | } |
7563 | |
7564 | llvm_unreachable("Unhandled case!");
7565 | }; |
7566 | |
7567 | unsigned Opcode = I->getOpcode(); |
7568 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
7569 | // For Trunc, the context is the only user, which must be a StoreInst. |
7570 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
7571 | if (I->hasOneUse()) |
7572 | if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) |
7573 | CCH = ComputeCCH(Store); |
7574 | } |
7575 | // For Z/Sext, the context is the operand, which must be a LoadInst. |
7576 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
7577 | Opcode == Instruction::FPExt) { |
7578 | if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) |
7579 | CCH = ComputeCCH(Load); |
7580 | } |
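// Editorial example: for "%e = zext i8 %l to i32" where the load %l was
// chosen for reversed widening, the cast is costed with
// TTI::CastContextHint::Reversed rather than CastContextHint::Normal.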
7581 | |
7582 | // We optimize the truncation of induction variables having constant |
7583 | // integer steps. The cost of these truncations is the same as the scalar |
7584 | // operation. |
7585 | if (isOptimizableIVTruncate(I, VF)) { |
7586 | auto *Trunc = cast<TruncInst>(I); |
7587 | return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), |
7588 | Trunc->getSrcTy(), CCH, CostKind, Trunc); |
7589 | } |
7590 | |
7591 | // Detect reduction patterns |
7592 | if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) |
7593 | return *RedCost; |
7594 | |
7595 | Type *SrcScalarTy = I->getOperand(0)->getType(); |
7596 | Type *SrcVecTy = |
7597 | VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; |
7598 | if (canTruncateToMinimalBitwidth(I, VF)) { |
7599 | // This cast is going to be shrunk. This may remove the cast or it might |
7600 | // turn it into a slightly different cast. For example, if MinBW == 16,
7601 | // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". |
7602 | // |
7603 | // Calculate the modified src and dest types. |
7604 | Type *MinVecTy = VectorTy; |
7605 | if (Opcode == Instruction::Trunc) { |
7606 | SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); |
7607 | VectorTy = |
7608 | largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); |
7609 | } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { |
7610 | SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); |
7611 | VectorTy = |
7612 | smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); |
7613 | } |
7614 | } |
7615 | |
7616 | return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); |
7617 | } |
7618 | case Instruction::Call: { |
7619 | if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) |
7620 | if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) |
7621 | return *RedCost; |
7622 | bool NeedToScalarize; |
7623 | CallInst *CI = cast<CallInst>(I); |
7624 | InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); |
7625 | if (getVectorIntrinsicIDForCall(CI, TLI)) { |
7626 | InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); |
7627 | return std::min(CallCost, IntrinsicCost); |
7628 | } |
7629 | return CallCost; |
7630 | } |
7631 | case Instruction::ExtractValue: |
7632 | return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); |
7633 | case Instruction::Alloca: |
7634 | // We cannot easily widen alloca to a scalable alloca, as |
7635 | // the result would need to be a vector of pointers. |
7636 | if (VF.isScalable()) |
7637 | return InstructionCost::getInvalid(); |
7638 | LLVM_FALLTHROUGH;
7639 | default: |
7640 | // This opcode is unknown. Assume that it is the same as 'mul'. |
7641 | return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); |
7642 | } // end of switch. |
7643 | } |
7644 | |
7645 | char LoopVectorize::ID = 0; |
7646 | |
7647 | static const char lv_name[] = "Loop Vectorization"; |
7648 | |
7649 | INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7650 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7651 | INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7652 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7653 | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7654 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7655 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7656 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7657 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7658 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7659 | INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7660 | INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7661 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7662 | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7663 | INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7664 | INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7665 | |
7666 | namespace llvm { |
7667 | |
7668 | Pass *createLoopVectorizePass() { return new LoopVectorize(); } |
7669 | |
7670 | Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, |
7671 | bool VectorizeOnlyWhenForced) { |
7672 | return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); |
7673 | } |
7674 | |
7675 | } // end namespace llvm |
7676 | |
7677 | bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { |
7678 | // Check if the pointer operand of a load or store instruction is |
7679 | // consecutive. |
7680 | if (auto *Ptr = getLoadStorePointerOperand(Inst)) |
7681 | return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); |
7682 | return false; |
7683 | } |
7684 | |
7685 | void LoopVectorizationCostModel::collectValuesToIgnore() { |
7686 | // Ignore ephemeral values. |
7687 | CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); |
7688 | |
7689 | // Ignore type-promoting instructions we identified during reduction |
7690 | // detection. |
7691 | for (auto &Reduction : Legal->getReductionVars()) { |
7692 | const RecurrenceDescriptor &RedDes = Reduction.second; |
7693 | const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
7694 | VecValuesToIgnore.insert(Casts.begin(), Casts.end()); |
7695 | } |
7696 | // Ignore type-casting instructions we identified during induction |
7697 | // detection. |
7698 | for (auto &Induction : Legal->getInductionVars()) { |
7699 | const InductionDescriptor &IndDes = Induction.second; |
7700 | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
7701 | VecValuesToIgnore.insert(Casts.begin(), Casts.end()); |
7702 | } |
7703 | } |
7704 | |
7705 | void LoopVectorizationCostModel::collectInLoopReductions() { |
7706 | for (auto &Reduction : Legal->getReductionVars()) { |
7707 | PHINode *Phi = Reduction.first; |
7708 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
7709 | |
7710 | // We don't collect reductions that are type promoted (yet). |
7711 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
7712 | continue; |
7713 | |
7714 | // If the target would prefer this reduction to happen "in-loop", then we |
7715 | // want to record it as such. |
7716 | unsigned Opcode = RdxDesc.getOpcode(); |
7717 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
7718 | !TTI.preferInLoopReduction(Opcode, Phi->getType(), |
7719 | TargetTransformInfo::ReductionFlags())) |
7720 | continue; |
7721 | |
7722 | // Check that we can correctly put the reductions into the loop, by |
7723 | // finding the chain of operations that leads from the phi to the loop |
7724 | // exit value. |
7725 | SmallVector<Instruction *, 4> ReductionOperations = |
7726 | RdxDesc.getReductionOpChain(Phi, TheLoop); |
7727 | bool InLoop = !ReductionOperations.empty(); |
7728 | if (InLoop) { |
7729 | InLoopReductionChains[Phi] = ReductionOperations; |
7730 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
7731 | Instruction *LastChain = Phi; |
7732 | for (auto *I : ReductionOperations) { |
7733 | InLoopReductionImmediateChains[I] = LastChain; |
7734 | LastChain = I; |
7735 | } |
7736 | } |
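// Editorial example: for a chain phi -> %add1 -> %add2 that feeds the loop
// exit value, the immediate-chains map records %add1 -> phi and
// %add2 -> %add1, so the cost model can walk each reduction link backwards.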
7737 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7738 | << " reduction for phi: " << *Phi << "\n");
7739 | } |
7740 | } |
7741 | |
7742 | // TODO: we could return a pair of values that specify the max VF and |
7743 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
7744 | // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7745 | // doesn't have a cost model that can choose which plan to execute if |
7746 | // more than one is generated. |
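// Editorial note: with 256-bit fixed-width vector registers and a widest
// scalar type of 32 bits, the helper below returns 256 / 32 = 8.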
7747 | static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, |
7748 | LoopVectorizationCostModel &CM) { |
7749 | unsigned WidestType; |
7750 | std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); |
7751 | return WidestVectorRegBits / WidestType; |
7752 | } |
7753 | |
7754 | VectorizationFactor |
7755 | LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { |
7756 | assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7757 | ElementCount VF = UserVF; |
7758 | // Outer loop handling: They may require CFG and instruction level |
7759 | // transformations before even evaluating whether vectorization is profitable. |
7760 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
7761 | // the vectorization pipeline. |
7762 | if (!OrigLoop->isInnermost()) { |
7763 | // If the user doesn't provide a vectorization factor, determine a |
7764 | // reasonable one. |
7765 | if (UserVF.isZero()) { |
7766 | VF = ElementCount::getFixed(determineVPlanVF( |
7767 | TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) |
7768 | .getFixedSize(), |
7769 | CM)); |
7770 | LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7771 | |
7772 | // Make sure we have a VF > 1 for stress testing. |
7773 | if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { |
7774 | LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7775 | << "overriding computed VF.\n");
7776 | VF = ElementCount::getFixed(4); |
7777 | } |
7778 | } |
7779 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7780 | assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7781 | "VF needs to be a power of two");
7782 | LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7783 | << "VF " << VF << " to build VPlans.\n");
7784 | buildVPlans(VF, VF); |
7785 | |
7786 | // For VPlan build stress testing, we bail out after VPlan construction. |
7787 | if (VPlanBuildStressTest) |
7788 | return VectorizationFactor::Disabled(); |
7789 | |
7790 | return {VF, 0 /*Cost*/}; |
7791 | } |
7792 | |
7793 | LLVM_DEBUG(
7794 | dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7795 | "VPlan-native path.\n");
7796 | return VectorizationFactor::Disabled(); |
7797 | } |
7798 | |
7799 | Optional<VectorizationFactor> |
7800 | LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { |
7801 | assert(OrigLoop->isInnermost() && "Inner loop expected.");
7802 | FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); |
7803 | if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7804 | return None; |
7805 | |
7806 | // Invalidate interleave groups if all blocks of loop will be predicated. |
7807 | if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && |
7808 | !useMaskedInterleavedAccesses(*TTI)) { |
7809 | LLVM_DEBUG(
7810 | dbgs()
7811 | << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7812 | "which requires masked-interleaved support.\n");
7813 | if (CM.InterleaveInfo.invalidateGroups()) |
7814 | // Invalidating interleave groups also requires invalidating all decisions |
7815 | // based on them, which includes widening decisions and uniform and scalar |
7816 | // values. |
7817 | CM.invalidateCostModelingDecisions(); |
7818 | } |
7819 | |
7820 | ElementCount MaxUserVF = |
7821 | UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; |
7822 | bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); |
7823 | if (!UserVF.isZero() && UserVFIsLegal) { |
7824 | assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7825 | "VF needs to be a power of two");
7826 | // Collect the instructions (and their associated costs) that will be more |
7827 | // profitable to scalarize. |
7828 | if (CM.selectUserVectorizationFactor(UserVF)) { |
7829 | LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7830 | CM.collectInLoopReductions(); |
7831 | buildVPlansWithVPRecipes(UserVF, UserVF); |
7832 | LLVM_DEBUG(printPlans(dbgs()));
7833 | return {{UserVF, 0}}; |
7834 | } else |
7835 | reportVectorizationInfo("UserVF ignored because of invalid costs.", |
7836 | "InvalidCost", ORE, OrigLoop); |
7837 | } |
7838 | |
7839 | // Populate the set of Vectorization Factor Candidates. |
7840 | ElementCountSet VFCandidates; |
7841 | for (auto VF = ElementCount::getFixed(1); |
7842 | ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) |
7843 | VFCandidates.insert(VF); |
7844 | for (auto VF = ElementCount::getScalable(1); |
7845 | ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) |
7846 | VFCandidates.insert(VF); |
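// Editorial example: with MaxFactors.FixedVF == 8 and MaxFactors.ScalableVF
// == vscale x 4, the candidate set is {1, 2, 4, 8} plus
// {vscale x 1, vscale x 2, vscale x 4}: powers of two up to each maximum.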
7847 | |
7848 | for (const auto &VF : VFCandidates) { |
7849 | // Collect Uniform and Scalar instructions after vectorization with VF. |
7850 | CM.collectUniformsAndScalars(VF); |
7851 | |
7852 | // Collect the instructions (and their associated costs) that will be more |
7853 | // profitable to scalarize. |
7854 | if (VF.isVector()) |
7855 | CM.collectInstsToScalarize(VF); |
7856 | } |
7857 | |
7858 | CM.collectInLoopReductions(); |
7859 | buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); |
7860 | buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); |
7861 | |
7862 | LLVM_DEBUG(printPlans(dbgs()));
7863 | if (!MaxFactors.hasVector()) |
7864 | return VectorizationFactor::Disabled(); |
7865 | |
7866 | // Select the optimal vectorization factor. |
7867 | auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); |
7868 | |
7869 | // Check if it is profitable to vectorize with runtime checks. |
7870 | unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); |
7871 | if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { |
7872 | bool PragmaThresholdReached = |
7873 | NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; |
7874 | bool ThresholdReached = |
7875 | NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; |
7876 | if ((ThresholdReached && !Hints.allowReordering()) || |
7877 | PragmaThresholdReached) { |
7878 | ORE->emit([&]() { |
7879 | return OptimizationRemarkAnalysisAliasing( |
7880 | DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7881 | OrigLoop->getHeader()) |
7882 | << "loop not vectorized: cannot prove it is safe to reorder " |
7883 | "memory operations"; |
7884 | }); |
7885 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7886 | Hints.emitRemarkWithHints(); |
7887 | return VectorizationFactor::Disabled(); |
7888 | } |
7889 | } |
7890 | return SelectedVF; |
7891 | } |
7892 | |
7893 | VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { |
7894 | assert(count_if(VPlans,
7895 | [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7896 | 1 &&
7897 | "Best VF does not have a single VPlan.");
7898 | |
7899 | for (const VPlanPtr &Plan : VPlans) { |
7900 | if (Plan->hasVF(VF)) |
7901 | return *Plan.get(); |
7902 | } |
7903 | llvm_unreachable("No plan found!");
7904 | } |
7905 | |
7906 | static void AddRuntimeUnrollDisableMetaData(Loop *L) { |
7907 | SmallVector<Metadata *, 4> MDs; |
7908 | // Reserve first location for self reference to the LoopID metadata node. |
7909 | MDs.push_back(nullptr); |
7910 | bool IsUnrollMetadata = false; |
7911 | MDNode *LoopID = L->getLoopID(); |
7912 | if (LoopID) { |
7913 | // First find existing loop unrolling disable metadata. |
7914 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
7915 | auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); |
7916 | if (MD) { |
7917 | const auto *S = dyn_cast<MDString>(MD->getOperand(0)); |
7918 | IsUnrollMetadata = |
7919 | S && S->getString().startswith("llvm.loop.unroll.disable"); |
7920 | } |
7921 | MDs.push_back(LoopID->getOperand(i)); |
7922 | } |
7923 | } |
7924 | |
7925 | if (!IsUnrollMetadata) { |
7926 | // Add runtime unroll disable metadata. |
7927 | LLVMContext &Context = L->getHeader()->getContext(); |
7928 | SmallVector<Metadata *, 1> DisableOperands; |
7929 | DisableOperands.push_back( |
7930 | MDString::get(Context, "llvm.loop.unroll.runtime.disable")); |
7931 | MDNode *DisableNode = MDNode::get(Context, DisableOperands); |
7932 | MDs.push_back(DisableNode); |
7933 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
7934 | // Set operand 0 to refer to the loop id itself. |
7935 | NewLoopID->replaceOperandWith(0, NewLoopID); |
7936 | L->setLoopID(NewLoopID); |
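// The added loop metadata looks roughly like this (editorial sketch):
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}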
7937 | } |
7938 | } |
7939 | |
7940 | void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, |
7941 | VPlan &BestVPlan, |
7942 | InnerLoopVectorizer &ILV, |
7943 | DominatorTree *DT) { |
7944 | LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7945 | << '\n');
7946 | |
7947 | // Perform the actual loop transformation. |
7948 | |
7949 | // 1. Create a new empty loop. Unlink the old loop and connect the new one. |
7950 | VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; |
7951 | Value *CanonicalIVStartValue; |
7952 | std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = |
7953 | ILV.createVectorizedLoopSkeleton(); |
7954 | ILV.collectPoisonGeneratingRecipes(State); |
7955 | |
7956 | ILV.printDebugTracesAtStart(); |
7957 | |
7958 | //===------------------------------------------------===// |
7959 | // |
7960 | // Notice: any optimization or new instruction that goes
7961 | // into the code below should also be implemented in |
7962 | // the cost-model. |
7963 | // |
7964 | //===------------------------------------------------===// |
7965 | |
7966 | // 2. Copy and widen instructions from the old loop into the new loop. |
7967 | BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), |
7968 | ILV.getOrCreateVectorTripCount(nullptr), |
7969 | CanonicalIVStartValue, State); |
7970 | BestVPlan.execute(&State); |
7971 | |
7972 | // Keep all loop hints from the original loop on the vector loop (we'll |
7973 | // replace the vectorizer-specific hints below). |
7974 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
7975 | |
7976 | Optional<MDNode *> VectorizedLoopID = |
7977 | makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, |
7978 | LLVMLoopVectorizeFollowupVectorized}); |
7979 | |
7980 | Loop *L = LI->getLoopFor(State.CFG.PrevBB); |
7981 | if (VectorizedLoopID.hasValue()) |
7982 | L->setLoopID(VectorizedLoopID.getValue()); |
7983 | else { |
7984 | // Keep all loop hints from the original loop on the vector loop (we'll |
7985 | // replace the vectorizer-specific hints below). |
7986 | if (MDNode *LID = OrigLoop->getLoopID()) |
7987 | L->setLoopID(LID); |
7988 | |
7989 | LoopVectorizeHints Hints(L, true, *ORE); |
7990 | Hints.setAlreadyVectorized(); |
7991 | } |
7992 | // Disable runtime unrolling when vectorizing the epilogue loop. |
7993 | if (CanonicalIVStartValue) |
7994 | AddRuntimeUnrollDisableMetaData(L); |
7995 | |
7996 | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
7997 | // predication, updating analyses. |
7998 | ILV.fixVectorizedLoop(State); |
7999 | |
8000 | ILV.printDebugTracesAtEnd(); |
8001 | } |
8002 | |
8003 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
8004 | void LoopVectorizationPlanner::printPlans(raw_ostream &O) { |
8005 | for (const auto &Plan : VPlans) |
8006 | if (PrintVPlansInDotFormat) |
8007 | Plan->printDOT(O); |
8008 | else |
8009 | Plan->print(O); |
8010 | } |
8011 | #endif |
8012 | |
8013 | void LoopVectorizationPlanner::collectTriviallyDeadInstructions( |
8014 | SmallPtrSetImpl<Instruction *> &DeadInstructions) { |
8015 | |
8016 | // We create new control-flow for the vectorized loop, so the original exit |
8017 | // conditions will be dead after vectorization if they are only used by the
8018 | // terminator.
8019 | SmallVector<BasicBlock*> ExitingBlocks; |
8020 | OrigLoop->getExitingBlocks(ExitingBlocks); |
8021 | for (auto *BB : ExitingBlocks) { |
8022 | auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); |
8023 | if (!Cmp || !Cmp->hasOneUse()) |
8024 | continue; |
8025 | |
8026 | // TODO: we should introduce a getUniqueExitingBlocks on Loop |
8027 | if (!DeadInstructions.insert(Cmp).second) |
8028 | continue; |
8029 | |
8030 | // An operand of the icmp is often a dead trunc, used by IndUpdate.
8031 | // TODO: can recurse through operands in general |
8032 | for (Value *Op : Cmp->operands()) { |
8033 | if (isa<TruncInst>(Op) && Op->hasOneUse()) |
8034 | DeadInstructions.insert(cast<Instruction>(Op)); |
8035 | } |
8036 | } |
8037 | |
8038 | // We create new "steps" for induction variable updates to which the original |
8039 | // induction variables map. An original update instruction will be dead if |
8040 | // all its users except the induction variable are dead. |
8041 | auto *Latch = OrigLoop->getLoopLatch(); |
8042 | for (auto &Induction : Legal->getInductionVars()) { |
8043 | PHINode *Ind = Induction.first; |
8044 | auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); |
8045 | |
8046 | // If the tail is to be folded by masking, the primary induction variable, |
8047 | // if it exists, isn't dead: it will be used for masking. Don't kill it.
8048 | if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) |
8049 | continue; |
8050 | |
8051 | if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { |
8052 | return U == Ind || DeadInstructions.count(cast<Instruction>(U)); |
8053 | })) |
8054 | DeadInstructions.insert(IndUpdate); |
8055 | } |
8056 | } |
8057 | |
8058 | Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } |
8059 | |
8060 | //===--------------------------------------------------------------------===// |
8061 | // EpilogueVectorizerMainLoop |
8062 | //===--------------------------------------------------------------------===// |
8063 | |
8064 | /// This function is partially responsible for generating the control flow |
8065 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
8066 | std::pair<BasicBlock *, Value *> |
8067 | EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { |
8068 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
8069 | Loop *Lp = createVectorLoopSkeleton(""); |
8070 | |
8071 | // Generate the code to check the minimum iteration count of the vector |
8072 | // epilogue (see below). |
8073 | EPI.EpilogueIterationCountCheck = |
8074 | emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); |
8075 | EPI.EpilogueIterationCountCheck->setName("iter.check"); |
8076 | |
8077 | // Generate the code to check any assumptions that we've made for SCEV |
8078 | // expressions. |
8079 | EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); |
8080 | |
8081 | // Generate the code that checks at runtime if arrays overlap. We put the |
8082 | // checks into a separate block to make the more common case of few elements |
8083 | // faster. |
8084 | EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); |
8085 | |
8086 | // Generate the iteration count check for the main loop, *after* the check |
8087 | // for the epilogue loop, so that the path-length is shorter for the case |
8088 | // that goes directly through the vector epilogue. The longer-path length for |
8089 | // the main loop is compensated for by the gain from vectorizing the larger
8090 | // trip count. Note: the branch will get updated later on when we vectorize |
8091 | // the epilogue. |
8092 | EPI.MainLoopIterationCountCheck = |
8093 | emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); |
8094 | |
8095 | // Generate the induction variable. |
8096 | Value *CountRoundDown = getOrCreateVectorTripCount(Lp); |
8097 | EPI.VectorTripCount = CountRoundDown; |
8098 | createHeaderBranch(Lp); |
8099 | |
8100 | // Skip induction resume value creation here because they will be created in |
8101 | // the second pass. If we created them here, they wouldn't be used anyway, |
8102 | // because the vplan in the second pass still contains the inductions from the |
8103 | // original loop. |
8104 | |
8105 | return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; |
8106 | } |
8107 | |
8108 | void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { |
8109 | LLVM_DEBUG({
8110 | dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8111 | << "Main Loop VF:" << EPI.MainLoopVF
8112 | << ", Main Loop UF:" << EPI.MainLoopUF
8113 | << ", Epilogue Loop VF:" << EPI.EpilogueVF
8114 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8115 | });
8116 | } |
8117 | |
8118 | void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { |
8119 | DEBUG_WITH_TYPE(VerboseDebug, {
8120 | dbgs() << "intermediate fn:\n"
8121 | << *OrigLoop->getHeader()->getParent() << "\n";
8122 | });
8123 | } |
8124 | |
8125 | BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( |
8126 | Loop *L, BasicBlock *Bypass, bool ForEpilogue) { |
8127 | assert(L && "Expected valid Loop.");
8128 | assert(Bypass && "Expected valid bypass basic block.");
8129 | ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; |
8130 | unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; |
8131 | Value *Count = getOrCreateTripCount(L); |
8132 | // Reuse existing vector loop preheader for TC checks. |
8133 | // Note that new preheader block is generated for vector loop. |
8134 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
8135 | IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
8136 | |
8137 | // Generate code to check if the loop's trip count is less than VF * UF of the |
8138 | // main vector loop. |
8139 | auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? |
8140 | ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; |
8141 | |
8142 | Value *CheckMinIters = Builder.CreateICmp( |
8143 | P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), |
8144 | "min.iters.check"); |
8145 | |
8146 | if (!ForEpilogue) |
8147 | TCCheckBlock->setName("vector.main.loop.iter.check"); |
8148 | |
8149 | // Create new preheader for vector loop. |
8150 | LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), |
8151 | DT, LI, nullptr, "vector.ph"); |
8152 | |
8153 | if (ForEpilogue) { |
8154 | assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8155 | DT->getNode(Bypass)->getIDom()) &&
8156 | "TC check is expected to dominate Bypass");
8157 | |
8158 | // Update dominator for Bypass & LoopExit. |
8159 | DT->changeImmediateDominator(Bypass, TCCheckBlock); |
8160 | if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) |
8161 | // For loops with multiple exits, there's no edge from the middle block |
8162 | // to exit blocks (as the epilogue must run) and thus no need to update |
8163 | // the immediate dominator of the exit blocks. |
8164 | DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); |
8165 | |
8166 | LoopBypassBlocks.push_back(TCCheckBlock); |
8167 | |
8168 | // Save the trip count so we don't have to regenerate it in the |
8169 | // vec.epilog.iter.check. This is safe to do because the trip count |
8170 | // generated here dominates the vector epilog iter check. |
8171 | EPI.TripCount = Count; |
8172 | } |
8173 | |
8174 | ReplaceInstWithInst( |
8175 | TCCheckBlock->getTerminator(), |
8176 | BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); |
8177 | |
8178 | return TCCheckBlock; |
8179 | } |
8180 | |
8181 | //===--------------------------------------------------------------------===// |
8182 | // EpilogueVectorizerEpilogueLoop |
8183 | //===--------------------------------------------------------------------===// |
8184 | |
8185 | /// This function is partially responsible for generating the control flow |
8186 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
8187 | std::pair<BasicBlock *, Value *> |
8188 | EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { |
8189 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
8190 | Loop *Lp = createVectorLoopSkeleton("vec.epilog."); |
8191 | |
8192 | // Now, compare the remaining count and, if there aren't enough iterations to
8193 | // execute the vectorized epilogue, skip to the scalar part.
8194 | BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; |
8195 | VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); |
8196 | LoopVectorPreHeader = |
8197 | SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, |
8198 | LI, nullptr, "vec.epilog.ph"); |
8199 | emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, |
8200 | VecEpilogueIterationCountCheck); |
8201 | |
8202 | // Adjust the control flow taking the state info from the main loop |
8203 | // vectorization into account. |
8204 | assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8205 | "expected this to be saved from the previous pass.");
8206 | EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( |
8207 | VecEpilogueIterationCountCheck, LoopVectorPreHeader); |
8208 | |
8209 | DT->changeImmediateDominator(LoopVectorPreHeader, |
8210 | EPI.MainLoopIterationCountCheck); |
8211 | |
8212 | EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( |
8213 | VecEpilogueIterationCountCheck, LoopScalarPreHeader); |
8214 | |
8215 | if (EPI.SCEVSafetyCheck) |
8216 | EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( |
8217 | VecEpilogueIterationCountCheck, LoopScalarPreHeader); |
8218 | if (EPI.MemSafetyCheck) |
8219 | EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( |
8220 | VecEpilogueIterationCountCheck, LoopScalarPreHeader); |
8221 | |
8222 | DT->changeImmediateDominator( |
8223 | VecEpilogueIterationCountCheck, |
8224 | VecEpilogueIterationCountCheck->getSinglePredecessor()); |
8225 | |
8226 | DT->changeImmediateDominator(LoopScalarPreHeader, |
8227 | EPI.EpilogueIterationCountCheck); |
8228 | if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) |
8229 | // If there is an epilogue which must run, there's no edge from the |
8230 | // middle block to exit blocks and thus no need to update the immediate |
8231 | // dominator of the exit blocks. |
8232 | DT->changeImmediateDominator(LoopExitBlock, |
8233 | EPI.EpilogueIterationCountCheck); |
8234 | |
8235 | // Keep track of bypass blocks, as they feed start values to the induction |
8236 | // phis in the scalar loop preheader. |
8237 | if (EPI.SCEVSafetyCheck) |
8238 | LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); |
8239 | if (EPI.MemSafetyCheck) |
8240 | LoopBypassBlocks.push_back(EPI.MemSafetyCheck); |
8241 | LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); |
8242 | |
8243 | // The vec.epilog.iter.check block may contain Phi nodes from reductions which |
8244 | // merge control-flow from the latch block and the middle block. Update the |
8245 | // incoming values here and move the Phi into the preheader. |
8246 | SmallVector<PHINode *, 4> PhisInBlock; |
8247 | for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) |
8248 | PhisInBlock.push_back(&Phi); |
8249 | |
8250 | for (PHINode *Phi : PhisInBlock) { |
8251 | Phi->replaceIncomingBlockWith( |
8252 | VecEpilogueIterationCountCheck->getSinglePredecessor(), |
8253 | VecEpilogueIterationCountCheck); |
8254 | Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); |
8255 | if (EPI.SCEVSafetyCheck) |
8256 | Phi->removeIncomingValue(EPI.SCEVSafetyCheck); |
8257 | if (EPI.MemSafetyCheck) |
8258 | Phi->removeIncomingValue(EPI.MemSafetyCheck); |
8259 | Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); |
8260 | } |
8261 | |
8262 | // Generate a resume induction for the vector epilogue and put it in the |
8263 | // vector epilogue preheader.
8264 | Type *IdxTy = Legal->getWidestInductionType(); |
8265 | PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", |
8266 | LoopVectorPreHeader->getFirstNonPHI()); |
8267 | EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); |
8268 | EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), |
8269 | EPI.MainLoopIterationCountCheck); |
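// Editorial sketch of the resulting phi (block names illustrative only):
//   %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ],
//                                    [ 0, %vector.main.loop.iter.check ]
// i.e. resume from the main loop's vector trip count when the main vector
// loop ran, and from 0 when it was bypassed.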
8270 | |
8271 | // Generate the induction variable. |
8272 | createHeaderBranch(Lp); |
8273 | |
8274 | // Generate induction resume values. These variables save the new starting |
8275 | // indexes for the scalar loop. They are used to test if there are any tail |
8276 | // iterations left once the vector loop has completed. |
8277 | // Note that when the vectorized epilogue is skipped due to iteration count |
8278 | // check, then the resume value for the induction variable comes from |
8279 | // the trip count of the main vector loop, hence passing the AdditionalBypass |
8280 | // argument. |
8281 | createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, |
8282 | EPI.VectorTripCount} /* AdditionalBypass */); |
8283 | |
8284 | return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; |
8285 | } |
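// A minimal standalone sketch (not part of the pass): the vec.epilog.resume.val
// phi built above reduces to this scalar selection. `SkippedMainLoop` and
// `MainVectorTripCount` are hypothetical stand-ins for the two incoming edges.
static unsigned epilogueResumeValueSketch(bool SkippedMainLoop,
                                          unsigned MainVectorTripCount) {
  // Arriving from the main loop's iteration-count check means the main vector
  // loop was bypassed, so the epilogue starts at iteration 0; otherwise it
  // resumes where the main vector loop stopped.
  return SkippedMainLoop ? 0 : MainVectorTripCount;
}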
8286 | |
8287 | BasicBlock * |
8288 | EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( |
8289 | Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { |
8290 | |
8291 | assert(EPI.TripCount &&
8292 | "Expected trip count to have been saved in the first pass.");
8293 | assert(
8294 | (!isa<Instruction>(EPI.TripCount) ||
8295 | DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8296 | "saved trip count does not dominate insertion point.");
8297 | Value *TC = EPI.TripCount; |
8298 | IRBuilder<> Builder(Insert->getTerminator()); |
8299 | Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); |
8300 | |
8301 | // Generate code to check if the remaining iteration count is less than
8302 | // VF * UF of the vector epilogue loop.
8303 | auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? |
8304 | ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; |
8305 | |
8306 | Value *CheckMinIters = |
8307 | Builder.CreateICmp(P, Count, |
8308 | createStepForVF(Builder, Count->getType(), |
8309 | EPI.EpilogueVF, EPI.EpilogueUF), |
8310 | "min.epilog.iters.check"); |
8311 | |
8312 | ReplaceInstWithInst( |
8313 | Insert->getTerminator(), |
8314 | BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); |
8315 | |
8316 | LoopBypassBlocks.push_back(Insert); |
8317 | return Insert; |
8318 | } |
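// Standalone sketch of the check emitted above, with plain integers standing
// in for the IR values (all names here are hypothetical). A true result takes
// the bypass edge to the scalar loop; requiresScalarEpilogue corresponds to
// comparing with ULE instead of ULT.
static bool bypassEpilogueSketch(unsigned TripCount, unsigned MainVectorTC,
                                 unsigned EpilogueVF, unsigned EpilogueUF,
                                 bool RequiresScalarEpilogue) {
  unsigned Remaining = TripCount - MainVectorTC; // n.vec.remaining
  unsigned Step = EpilogueVF * EpilogueUF;       // one epilogue vector step
  return RequiresScalarEpilogue ? Remaining <= Step : Remaining < Step;
}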
8319 | |
8320 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { |
8321 | LLVM_DEBUG({
8322 | dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8323 | << "Epilogue Loop VF:" << EPI.EpilogueVF
8324 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8325 | });
8326 | } |
8327 | |
8328 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { |
8329 | DEBUG_WITH_TYPE(VerboseDebug, {
8330 | dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8331 | });
8332 | } |
8333 | |
8334 | bool LoopVectorizationPlanner::getDecisionAndClampRange( |
8335 | const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { |
8336 | assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8337 | bool PredicateAtRangeStart = Predicate(Range.Start); |
8338 | |
8339 | for (ElementCount TmpVF = Range.Start * 2; |
8340 | ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) |
8341 | if (Predicate(TmpVF) != PredicateAtRangeStart) { |
8342 | Range.End = TmpVF; |
8343 | break; |
8344 | } |
8345 | |
8346 | return PredicateAtRangeStart; |
8347 | } |
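// Minimal sketch of the clamping behaviour for fixed-width VFs (scalable
// handling elided; the unsigned stand-ins are hypothetical): the predicate's
// answer at Range.Start is returned, and the range's End is clamped to the
// first power-of-two VF at which the answer flips, so all VFs left in
// [Start, End) share one decision.
static bool clampRangeSketch(bool (*Pred)(unsigned), unsigned Start,
                             unsigned &End) {
  bool AtStart = Pred(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    if (Pred(VF) != AtStart) {
      End = VF;
      break;
    }
  return AtStart;
}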
8348 | |
8349 | /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, |
8350 | /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range |
8351 | /// of VF's starting at a given VF and extending it as much as possible. Each |
8352 | /// vectorization decision can potentially shorten this sub-range during |
8353 | /// buildVPlan(). |
8354 | void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, |
8355 | ElementCount MaxVF) { |
8356 | auto MaxVFPlusOne = MaxVF.getWithIncrement(1); |
8357 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { |
8358 | VFRange SubRange = {VF, MaxVFPlusOne}; |
8359 | VPlans.push_back(buildVPlan(SubRange)); |
8360 | VF = SubRange.End; |
8361 | } |
8362 | } |
8363 | |
8364 | VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, |
8365 | VPlanPtr &Plan) { |
8366 | assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8367 | |
8368 | // Look for cached value. |
8369 | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
8370 | EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); |
8371 | if (ECEntryIt != EdgeMaskCache.end()) |
8372 | return ECEntryIt->second; |
8373 | |
8374 | VPValue *SrcMask = createBlockInMask(Src, Plan); |
8375 | |
8376 | // The terminator has to be a branch inst! |
8377 | BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); |
8378 | assert(BI && "Unexpected terminator found");
8379 | |
8380 | if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) |
8381 | return EdgeMaskCache[Edge] = SrcMask; |
8382 | |
8383 | // If source is an exiting block, we know the exit edge is dynamically dead |
8384 | // in the vector loop, and thus we don't need to restrict the mask. Avoid |
8385 | // adding uses of an otherwise potentially dead instruction. |
8386 | if (OrigLoop->isLoopExiting(Src)) |
8387 | return EdgeMaskCache[Edge] = SrcMask; |
8388 | |
8389 | VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); |
8390 | assert(EdgeMask && "No Edge Mask found for condition");
8391 | |
8392 | if (BI->getSuccessor(0) != Dst) |
8393 | EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); |
8394 | |
8395 | if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. |
8396 | // The condition is 'SrcMask && EdgeMask', which is equivalent to |
8397 | // 'select i1 SrcMask, i1 EdgeMask, i1 false'. |
8398 | // The select version does not introduce new UB if SrcMask is false and |
8399 | // EdgeMask is poison. Using 'and' here introduces undefined behavior. |
8400 | VPValue *False = Plan->getOrAddVPValue( |
8401 | ConstantInt::getFalse(BI->getCondition()->getType())); |
8402 | EdgeMask = |
8403 | Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); |
8404 | } |
8405 | |
8406 | return EdgeMaskCache[Edge] = EdgeMask; |
8407 | } |
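// Standalone sketch of why a select is preferred over an 'and' above: when
// SrcMask is false, the select yields false even if EdgeMask is poison,
// whereas an 'and' of false with poison would keep the poison. Plain bools
// stand in for single mask lanes (names hypothetical).
static bool edgeMaskSketch(bool SrcMask, bool EdgeMask) {
  // Equivalent of: select i1 SrcMask, i1 EdgeMask, i1 false.
  return SrcMask ? EdgeMask : false;
}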
8408 | |
8409 | VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { |
8410 | assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8411 | |
8412 | // Look for cached value. |
8413 | BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); |
8414 | if (BCEntryIt != BlockMaskCache.end()) |
8415 | return BCEntryIt->second; |
8416 | |
8417 | // All-one mask is modelled as no-mask following the convention for masked |
8418 | // load/store/gather/scatter. Initialize BlockMask to no-mask. |
8419 | VPValue *BlockMask = nullptr; |
8420 | |
8421 | if (OrigLoop->getHeader() == BB) { |
8422 | if (!CM.blockNeedsPredicationForAnyReason(BB)) |
8423 | return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. |
8424 | |
8425 | // Introduce the early-exit compare IV <= BTC to form header block mask. |
8426 | // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by |
8427 | // constructing the desired canonical IV in the header block as its first |
8428 | // non-phi instructions. |
8429 | assert(CM.foldTailByMasking() && "must fold the tail");
8430 | VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); |
8431 | auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); |
8432 | auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); |
8433 | HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); |
8434 | |
8435 | VPBuilder::InsertPointGuard Guard(Builder); |
8436 | Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); |
8437 | if (CM.TTI.emitGetActiveLaneMask()) { |
8438 | VPValue *TC = Plan->getOrCreateTripCount(); |
8439 | BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); |
8440 | } else { |
8441 | VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); |
8442 | BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); |
8443 | } |
8444 | return BlockMaskCache[BB] = BlockMask; |
8445 | } |
8446 | |
8447 | // This is the block mask. We OR all incoming edges. |
8448 | for (auto *Predecessor : predecessors(BB)) { |
8449 | VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); |
8450 | if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. |
8451 | return BlockMaskCache[BB] = EdgeMask; |
8452 | |
8453 | if (!BlockMask) { // BlockMask has its initialized nullptr value. |
8454 | BlockMask = EdgeMask; |
8455 | continue; |
8456 | } |
8457 | |
8458 | BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); |
8459 | } |
8460 | |
8461 | return BlockMaskCache[BB] = BlockMask; |
8462 | } |
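// One-lane sketch of the header mask used when folding the tail (the real
// code emits an active.lane.mask or a vector ICmpULE; names hypothetical): a
// lane with index IV is active iff IV <= BTC, the backedge-taken count.
// Comparing IV < TC instead would be wrong because TC may wrap to 0 while
// BTC stays representable.
static bool headerLaneActiveSketch(unsigned IV, unsigned BTC) {
  return IV <= BTC;
}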
8463 | |
8464 | VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, |
8465 | ArrayRef<VPValue *> Operands, |
8466 | VFRange &Range, |
8467 | VPlanPtr &Plan) { |
8468 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8469 | "Must be called with either a load or store");
8470 | |
8471 | auto willWiden = [&](ElementCount VF) -> bool { |
8472 | if (VF.isScalar()) |
8473 | return false; |
8474 | LoopVectorizationCostModel::InstWidening Decision = |
8475 | CM.getWideningDecision(I, VF); |
8476 | assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8477 | "CM decision should be taken at this point.");
8478 | if (Decision == LoopVectorizationCostModel::CM_Interleave) |
8479 | return true; |
8480 | if (CM.isScalarAfterVectorization(I, VF) || |
8481 | CM.isProfitableToScalarize(I, VF)) |
8482 | return false; |
8483 | return Decision != LoopVectorizationCostModel::CM_Scalarize; |
8484 | }; |
8485 | |
8486 | if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) |
8487 | return nullptr; |
8488 | |
8489 | VPValue *Mask = nullptr; |
8490 | if (Legal->isMaskRequired(I)) |
8491 | Mask = createBlockInMask(I->getParent(), Plan); |
8492 | |
8493 | // Determine if the pointer operand of the access is either consecutive or |
8494 | // reverse consecutive. |
8495 | LoopVectorizationCostModel::InstWidening Decision = |
8496 | CM.getWideningDecision(I, Range.Start); |
8497 | bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; |
8498 | bool Consecutive = |
8499 | Reverse || Decision == LoopVectorizationCostModel::CM_Widen; |
8500 | |
8501 | if (LoadInst *Load = dyn_cast<LoadInst>(I)) |
8502 | return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, |
8503 | Consecutive, Reverse); |
8504 | |
8505 | StoreInst *Store = cast<StoreInst>(I); |
8506 | return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], |
8507 | Mask, Consecutive, Reverse); |
8508 | } |
8509 | |
8510 | static VPWidenIntOrFpInductionRecipe * |
8511 | createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, |
8512 | VPValue *Start, const InductionDescriptor &IndDesc, |
8513 | LoopVectorizationCostModel &CM, Loop &OrigLoop, |
8514 | VFRange &Range) { |
8515 | // Returns true if an instruction \p I should be scalarized instead of |
8516 | // vectorized for the chosen vectorization factor. |
8517 | auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { |
8518 | return CM.isScalarAfterVectorization(I, VF) || |
8519 | CM.isProfitableToScalarize(I, VF); |
8520 | }; |
8521 | |
8522 | bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( |
8523 | [&](ElementCount VF) { |
8524 | // Returns true if we should generate a scalar version of \p IV. |
8525 | if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) |
8526 | return true; |
8527 | auto isScalarInst = [&](User *U) -> bool { |
8528 | auto *I = cast<Instruction>(U); |
8529 | return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); |
8530 | }; |
8531 | return any_of(PhiOrTrunc->users(), isScalarInst); |
8532 | }, |
8533 | Range); |
8534 | bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( |
8535 | [&](ElementCount VF) { |
8536 | return ShouldScalarizeInstruction(PhiOrTrunc, VF); |
8537 | }, |
8538 | Range); |
8539 | assert(IndDesc.getStartValue() ==
8540 | Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8541 | if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { |
8542 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, |
8543 | NeedsScalarIV, !NeedsScalarIVOnly); |
8544 | } |
8545 | assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8546 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, |
8547 | !NeedsScalarIVOnly); |
8548 | } |
8549 | |
8550 | VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( |
8551 | PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { |
8552 | |
8553 | // Check if this is an integer or fp induction. If so, build the recipe that |
8554 | // produces its scalar and vector values. |
8555 | if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) |
8556 | return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, |
8557 | Range); |
8558 | |
8559 | return nullptr; |
8560 | } |
8561 | |
8562 | VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( |
8563 | TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, |
8564 | VPlan &Plan) const { |
8565 | // Optimize the special case where the source is a constant integer |
8566 | // induction variable. Notice that we can only optimize the 'trunc' case |
8567 | // because (a) FP conversions lose precision, (b) sext/zext may wrap, and |
8568 | // (c) other casts depend on pointer size. |
8569 | |
8570 | // Determine whether \p K is a truncation based on an induction variable that |
8571 | // can be optimized. |
8572 | auto isOptimizableIVTruncate = |
8573 | [&](Instruction *K) -> std::function<bool(ElementCount)> { |
8574 | return [=](ElementCount VF) -> bool { |
8575 | return CM.isOptimizableIVTruncate(K, VF); |
8576 | }; |
8577 | }; |
8578 | |
8579 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
8580 | isOptimizableIVTruncate(I), Range)) { |
8581 | |
8582 | auto *Phi = cast<PHINode>(I->getOperand(0)); |
8583 | const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); |
8584 | VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); |
8585 | return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); |
8586 | } |
8587 | return nullptr; |
8588 | } |
8589 | |
8590 | VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, |
8591 | ArrayRef<VPValue *> Operands, |
8592 | VPlanPtr &Plan) { |
8593 | // If all incoming values are equal, the incoming VPValue can be used directly |
8594 | // instead of creating a new VPBlendRecipe. |
8595 | VPValue *FirstIncoming = Operands[0]; |
8596 | if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { |
8597 | return FirstIncoming == Inc; |
8598 | })) { |
8599 | return Operands[0]; |
8600 | } |
8601 | |
8602 | // We know that all PHIs in non-header blocks are converted into selects, so |
8603 | // we don't have to worry about the insertion order and we can just use the |
8604 | // builder. At this point we generate the predication tree. There may be |
8605 | // duplications since this is a simple recursive scan, but future |
8606 | // optimizations will clean it up. |
8607 | SmallVector<VPValue *, 2> OperandsWithMask; |
8608 | unsigned NumIncoming = Phi->getNumIncomingValues(); |
8609 | |
8610 | for (unsigned In = 0; In < NumIncoming; In++) { |
8611 | VPValue *EdgeMask = |
8612 | createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); |
8613 | assert((EdgeMask || NumIncoming == 1) &&
8614 | "Multiple predecessors with one having a full mask");
8615 | OperandsWithMask.push_back(Operands[In]); |
8616 | if (EdgeMask) |
8617 | OperandsWithMask.push_back(EdgeMask); |
8618 | } |
8619 | return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); |
8620 | } |
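// Scalar sketch of what the VPBlendRecipe built above computes for a phi with
// three incoming values (names hypothetical): a chain of selects keyed on the
// incoming edges' masks, mirroring OperandsWithMask above.
static int blendSketch(int V0, int V1, bool Mask1, int V2, bool Mask2) {
  int Blend = V0;             // First incoming value seeds the chain.
  Blend = Mask1 ? V1 : Blend; // Each later value is selected by its edge mask.
  Blend = Mask2 ? V2 : Blend;
  return Blend;
}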
8621 | |
8622 | VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, |
8623 | ArrayRef<VPValue *> Operands, |
8624 | VFRange &Range) const { |
8625 | |
8626 | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
8627 | [this, CI](ElementCount VF) { |
8628 | return CM.isScalarWithPredication(CI, VF); |
8629 | }, |
8630 | Range); |
8631 | |
8632 | if (IsPredicated) |
8633 | return nullptr; |
8634 | |
8635 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
8636 | if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || |
8637 | ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || |
8638 | ID == Intrinsic::pseudoprobe || |
8639 | ID == Intrinsic::experimental_noalias_scope_decl)) |
8640 | return nullptr; |
8641 | |
8642 | auto willWiden = [&](ElementCount VF) -> bool { |
8643 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
8644 | // The following case may be scalarized depending on the VF.
8645 | // NeedToScalarize indicates whether the call must be scalarized for this
8646 | // VF; we also check whether performing the call as a vector intrinsic is
8647 | // beneficial compared to a vectorized library call.
8648 | bool NeedToScalarize = false; |
8649 | InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); |
8650 | InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; |
8651 | bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; |
8652 | return UseVectorIntrinsic || !NeedToScalarize; |
8653 | }; |
8654 | |
8655 | if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) |
8656 | return nullptr; |
8657 | |
8658 | ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); |
8659 | return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); |
8660 | } |
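// Sketch of the willWiden decision above with plain integer costs (all names
// hypothetical): prefer a vector intrinsic when one exists and is no more
// expensive than the (possibly scalarized) call; otherwise widen only if the
// call need not be scalarized.
static bool willWidenCallSketch(bool HasIntrinsic, int IntrinsicCost,
                                int CallCost, bool NeedToScalarize) {
  bool UseVectorIntrinsic = HasIntrinsic && IntrinsicCost <= CallCost;
  return UseVectorIntrinsic || !NeedToScalarize;
}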
8661 | |
8662 | bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { |
8663 | assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8664 | !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8665 | // Instruction should be widened, unless it is scalar after vectorization, |
8666 | // scalarization is profitable or it is predicated. |
8667 | auto WillScalarize = [this, I](ElementCount VF) -> bool { |
8668 | return CM.isScalarAfterVectorization(I, VF) || |
8669 | CM.isProfitableToScalarize(I, VF) || |
8670 | CM.isScalarWithPredication(I, VF); |
8671 | }; |
8672 | return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, |
8673 | Range); |
8674 | } |
8675 | |
8676 | VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, |
8677 | ArrayRef<VPValue *> Operands) const { |
8678 | auto IsVectorizableOpcode = [](unsigned Opcode) { |
8679 | switch (Opcode) { |
8680 | case Instruction::Add: |
8681 | case Instruction::And: |
8682 | case Instruction::AShr: |
8683 | case Instruction::BitCast: |
8684 | case Instruction::FAdd: |
8685 | case Instruction::FCmp: |
8686 | case Instruction::FDiv: |
8687 | case Instruction::FMul: |
8688 | case Instruction::FNeg: |
8689 | case Instruction::FPExt: |
8690 | case Instruction::FPToSI: |
8691 | case Instruction::FPToUI: |
8692 | case Instruction::FPTrunc: |
8693 | case Instruction::FRem: |
8694 | case Instruction::FSub: |
8695 | case Instruction::ICmp: |
8696 | case Instruction::IntToPtr: |
8697 | case Instruction::LShr: |
8698 | case Instruction::Mul: |
8699 | case Instruction::Or: |
8700 | case Instruction::PtrToInt: |
8701 | case Instruction::SDiv: |
8702 | case Instruction::Select: |
8703 | case Instruction::SExt: |
8704 | case Instruction::Shl: |
8705 | case Instruction::SIToFP: |
8706 | case Instruction::SRem: |
8707 | case Instruction::Sub: |
8708 | case Instruction::Trunc: |
8709 | case Instruction::UDiv: |
8710 | case Instruction::UIToFP: |
8711 | case Instruction::URem: |
8712 | case Instruction::Xor: |
8713 | case Instruction::ZExt: |
8714 | return true; |
8715 | } |
8716 | return false; |
8717 | }; |
8718 | |
8719 | if (!IsVectorizableOpcode(I->getOpcode())) |
8720 | return nullptr; |
8721 | |
8722 | // Success: widen this instruction. |
8723 | return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); |
8724 | } |
8725 | |
8726 | void VPRecipeBuilder::fixHeaderPhis() { |
8727 | BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); |
8728 | for (VPHeaderPHIRecipe *R : PhisToFix) { |
8729 | auto *PN = cast<PHINode>(R->getUnderlyingValue()); |
8730 | VPRecipeBase *IncR = |
8731 | getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); |
8732 | R->addOperand(IncR->getVPSingleValue()); |
8733 | } |
8734 | } |
8735 | |
8736 | VPBasicBlock *VPRecipeBuilder::handleReplication( |
8737 | Instruction *I, VFRange &Range, VPBasicBlock *VPBB, |
8738 | VPlanPtr &Plan) { |
8739 | bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( |
8740 | [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, |
8741 | Range); |
8742 | |
8743 | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
8744 | [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, |
8745 | Range); |
8746 | |
8747 | // Even if the instruction is not marked as uniform, there are certain |
8748 | // intrinsic calls that can be effectively treated as such, so we check for |
8749 | // them here. Conservatively, we only do this for scalable vectors, since |
8750 | // for fixed-width VFs we can always fall back on full scalarization. |
8751 | if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { |
8752 | switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { |
8753 | case Intrinsic::assume: |
8754 | case Intrinsic::lifetime_start: |
8755 | case Intrinsic::lifetime_end: |
8756 | // For scalable vectors if one of the operands is variant then we still |
8757 | // want to mark as uniform, which will generate one instruction for just |
8758 | // the first lane of the vector. We can't scalarize the call in the same |
8759 | // way as for fixed-width vectors because we don't know how many lanes |
8760 | // there are. |
8761 | // |
8762 | // The reasons for doing it this way for scalable vectors are: |
8763 | // 1. For the assume intrinsic generating the instruction for the first |
8764 | // lane is still better than not generating any at all. For
8765 | // example, the input may be a splat across all lanes. |
8766 | // 2. For the lifetime start/end intrinsics the pointer operand only |
8767 | // does anything useful when the input comes from a stack object, |
8768 | // which suggests it should always be uniform. For non-stack objects |
8769 | // the effect is to poison the object, which still allows us to |
8770 | // remove the call. |
8771 | IsUniform = true; |
8772 | break; |
8773 | default: |
8774 | break; |
8775 | } |
8776 | } |
8777 | |
8778 | auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), |
8779 | IsUniform, IsPredicated); |
8780 | setRecipe(I, Recipe); |
8781 | Plan->addVPValue(I, Recipe); |
8782 | |
8783 | // Find if I uses a predicated instruction. If so, it will use its scalar |
8784 | // value. Avoid hoisting the insert-element which packs the scalar value into |
8785 | // a vector value, as that happens iff all users use the vector value. |
8786 | for (VPValue *Op : Recipe->operands()) { |
8787 | auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); |
8788 | if (!PredR) |
8789 | continue; |
8790 | auto *RepR = |
8791 | cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); |
8792 | assert(RepR->isPredicated() &&
8793 | "expected Replicate recipe to be predicated");
8794 | RepR->setAlsoPack(false); |
8795 | } |
8796 | |
8797 | // Finalize the recipe for Instr, first if it is not predicated. |
8798 | if (!IsPredicated) { |
8799 | LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Scalarizing:" << *I << "\n"; } } while (false); |
8800 | VPBB->appendRecipe(Recipe); |
8801 | return VPBB; |
8802 | } |
8803 | LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"; } } while (false); |
8804 | |
8805 | VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); |
8806 | assert(SingleSucc && "VPBB must have a single successor when handling "
8807 | "predicated replication.");
8808 | VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); |
8809 | // Record predicated instructions for above packing optimizations. |
8810 | VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); |
8811 | VPBlockUtils::insertBlockAfter(Region, VPBB); |
8812 | auto *RegSucc = new VPBasicBlock(); |
8813 | VPBlockUtils::insertBlockAfter(RegSucc, Region); |
8814 | VPBlockUtils::connectBlocks(RegSucc, SingleSucc); |
8815 | return RegSucc; |
8816 | } |
8817 | |
8818 | VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, |
8819 | VPRecipeBase *PredRecipe, |
8820 | VPlanPtr &Plan) { |
8821 | // Instructions marked for predication are replicated and placed under an |
8822 | // if-then construct to prevent side-effects. |
8823 | |
8824 | // Generate recipes to compute the block mask for this region. |
8825 | VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); |
8826 | |
8827 | // Build the triangular if-then region. |
8828 | std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); |
8829 | assert(Instr->getParent() && "Predicated instruction not in any basic block");
8830 | auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); |
8831 | auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); |
8832 | auto *PHIRecipe = Instr->getType()->isVoidTy() |
8833 | ? nullptr |
8834 | : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); |
8835 | if (PHIRecipe) { |
8836 | Plan->removeVPValueFor(Instr); |
8837 | Plan->addVPValue(Instr, PHIRecipe); |
8838 | } |
8839 | auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); |
8840 | auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); |
8841 | VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); |
8842 | |
8843 | // Note: first set Entry as region entry and then connect successors starting |
8844 | // from it in order, to propagate the "parent" of each VPBasicBlock. |
8845 | VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); |
8846 | VPBlockUtils::connectBlocks(Pred, Exit); |
8847 | |
8848 | return Region; |
8849 | } |
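// Scalar sketch of the triangular pred.X region built above (names
// hypothetical): the replicated instruction runs only when the block mask is
// set, and the continue block's phi merges its result back in. `Masked`
// stands for the value seen on the masked-off path (poison in the actual IR).
static int replicateRegionSketch(bool BlockInMask, int IfValue, int Masked) {
  // pred.X.entry: branch on mask; pred.X.if: compute; pred.X.continue: phi.
  return BlockInMask ? IfValue : Masked;
}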
8850 | |
8851 | VPRecipeOrVPValueTy |
8852 | VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, |
8853 | ArrayRef<VPValue *> Operands, |
8854 | VFRange &Range, VPlanPtr &Plan) { |
8855 | // First, check for specific widening recipes that deal with calls, memory |
8856 | // operations, inductions and Phi nodes. |
8857 | if (auto *CI = dyn_cast<CallInst>(Instr)) |
8858 | return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); |
8859 | |
8860 | if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) |
8861 | return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); |
8862 | |
8863 | VPRecipeBase *Recipe; |
8864 | if (auto Phi = dyn_cast<PHINode>(Instr)) { |
8865 | if (Phi->getParent() != OrigLoop->getHeader()) |
8866 | return tryToBlend(Phi, Operands, Plan); |
8867 | if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) |
8868 | return toVPRecipeResult(Recipe); |
8869 | |
8870 | VPHeaderPHIRecipe *PhiRecipe = nullptr; |
8871 | if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { |
8872 | VPValue *StartV = Operands[0]; |
8873 | if (Legal->isReductionVariable(Phi)) { |
8874 | const RecurrenceDescriptor &RdxDesc = |
8875 | Legal->getReductionVars().find(Phi)->second; |
8876 | assert(RdxDesc.getRecurrenceStartValue() ==
8877 | Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8878 | PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, |
8879 | CM.isInLoopReduction(Phi), |
8880 | CM.useOrderedReductions(RdxDesc)); |
8881 | } else { |
8882 | PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); |
8883 | } |
8884 | |
8885 | // Record the incoming value from the backedge, so we can add the incoming |
8886 | // value from the backedge after all recipes have been created. |
8887 | recordRecipeOf(cast<Instruction>( |
8888 | Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); |
8889 | PhisToFix.push_back(PhiRecipe); |
8890 | } else { |
8891 | // TODO: record backedge value for remaining pointer induction phis. |
8892 | assert(Phi->getType()->isPointerTy() &&
8893 | "only pointer phis should be handled here");
8894 | assert(Legal->getInductionVars().count(Phi) &&
8895 | "Not an induction variable");
8896 | InductionDescriptor II = Legal->getInductionVars().lookup(Phi); |
8897 | VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); |
8898 | PhiRecipe = new VPWidenPHIRecipe(Phi, Start); |
8899 | } |
8900 | |
8901 | return toVPRecipeResult(PhiRecipe); |
8902 | } |
8903 | |
8904 | if (isa<TruncInst>(Instr) && |
8905 | (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, |
8906 | Range, *Plan))) |
8907 | return toVPRecipeResult(Recipe); |
8908 | |
8909 | if (!shouldWiden(Instr, Range)) |
8910 | return nullptr; |
8911 | |
8912 | if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) |
8913 | return toVPRecipeResult(new VPWidenGEPRecipe( |
8914 | GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); |
8915 | |
8916 | if (auto *SI = dyn_cast<SelectInst>(Instr)) { |
8917 | bool InvariantCond = |
8918 | PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); |
8919 | return toVPRecipeResult(new VPWidenSelectRecipe( |
8920 | *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); |
8921 | } |
8922 | |
8923 | return toVPRecipeResult(tryToWiden(Instr, Operands)); |
8924 | } |
8925 | |
8926 | void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, |
8927 | ElementCount MaxVF) { |
8928 | assert(OrigLoop->isInnermost() && "Inner loop expected.");
8929 | |
8930 | // Collect instructions from the original loop that will become trivially dead |
8931 | // in the vectorized loop. We don't need to vectorize these instructions. For |
8932 | // example, original induction update instructions can become dead because we |
8933 | // separately emit induction "steps" when generating code for the new loop. |
8934 | // Similarly, we create a new latch condition when setting up the structure |
8935 | // of the new loop, so the old one can become dead. |
8936 | SmallPtrSet<Instruction *, 4> DeadInstructions; |
8937 | collectTriviallyDeadInstructions(DeadInstructions); |
8938 | |
8939 | // Add assume instructions we need to drop to DeadInstructions, to prevent |
8940 | // them from being added to the VPlan. |
8941 | // TODO: We only need to drop assumes in blocks that get flattened. If the
8942 | // control flow is preserved, we should keep them. |
8943 | auto &ConditionalAssumes = Legal->getConditionalAssumes(); |
8944 | DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); |
8945 | |
8946 | MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); |
8947 | // Dead instructions do not need sinking. Remove them from SinkAfter. |
8948 | for (Instruction *I : DeadInstructions) |
8949 | SinkAfter.erase(I); |
8950 | |
8951 | // Cannot sink instructions after dead instructions (there won't be any |
8952 | // recipes for them). Instead, find the first non-dead previous instruction. |
8953 | for (auto &P : Legal->getSinkAfter()) { |
8954 | Instruction *SinkTarget = P.second; |
8955 | Instruction *FirstInst = &*SinkTarget->getParent()->begin(); |
8956 | (void)FirstInst; |
8957 | while (DeadInstructions.contains(SinkTarget)) { |
8958 | assert(
8959 | SinkTarget != FirstInst &&
8960 | "Must find a live instruction (at least the one feeding the "
8961 | "first-order recurrence PHI) before reaching beginning of the block");
8962 | SinkTarget = SinkTarget->getPrevNode(); |
8963 | assert(SinkTarget != P.first &&
8964 | "sink source equals target, no sinking required");
8965 | } |
8966 | P.second = SinkTarget; |
8967 | } |
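// Sketch of the backward walk above over a plain array standing in for a
// basic block (names hypothetical): step back from a dead sink target until
// the first live instruction is found.
static unsigned firstLiveSinkTargetSketch(const bool Dead[], unsigned Idx) {
  while (Dead[Idx])
    --Idx; // The real code asserts this never walks past the block's start.
  return Idx;
}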
8968 | |
8969 | auto MaxVFPlusOne = MaxVF.getWithIncrement(1); |
8970 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { |
8971 | VFRange SubRange = {VF, MaxVFPlusOne}; |
8972 | VPlans.push_back( |
8973 | buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); |
8974 | VF = SubRange.End; |
8975 | } |
8976 | } |
8977 | |
8978 | // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a |
8979 | // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a |
8980 | // BranchOnCount VPInstruction to the latch. |
8981 | static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, |
8982 | bool HasNUW, bool IsVPlanNative) { |
8983 | Value *StartIdx = ConstantInt::get(IdxTy, 0); |
8984 | auto *StartV = Plan.getOrAddVPValue(StartIdx); |
8985 | |
8986 | auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); |
8987 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
8988 | VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); |
8989 | if (IsVPlanNative) |
8990 | Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); |
8991 | Header->insert(CanonicalIVPHI, Header->begin()); |
8992 | |
8993 | auto *CanonicalIVIncrement = |
8994 | new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW |
8995 | : VPInstruction::CanonicalIVIncrement, |
8996 | {CanonicalIVPHI}, DL); |
8997 | CanonicalIVPHI->addOperand(CanonicalIVIncrement); |
8998 | |
8999 | VPBasicBlock *EB = TopRegion->getExitBasicBlock(); |
9000 | if (IsVPlanNative) { |
9001 | EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); |
9002 | EB->setCondBit(nullptr); |
9003 | } |
9004 | EB->appendRecipe(CanonicalIVIncrement); |
9005 | |
9006 | auto *BranchOnCount = |
9007 | new VPInstruction(VPInstruction::BranchOnCount, |
9008 | {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); |
9009 | EB->appendRecipe(BranchOnCount); |
9010 | } |
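// Scalar sketch of the control recipes added above (names hypothetical): a
// canonical IV starting at 0, incremented by VF * UF per vector iteration,
// with BranchOnCount testing the increment against the vector trip count,
// which is always a multiple of VF * UF.
static void canonicalIVSketch(unsigned VectorTripCount, unsigned VF,
                              unsigned UF) {
  for (unsigned CanonicalIV = 0; CanonicalIV != VectorTripCount;
       CanonicalIV += VF * UF) {
    // The region's body recipes execute here once per vector iteration.
  }
}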
9011 | |
9012 | VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( |
9013 | VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, |
9014 | const MapVector<Instruction *, Instruction *> &SinkAfter) { |
9015 | |
9016 | SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; |
9017 | |
9018 | VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); |
9019 | |
9020 | // --------------------------------------------------------------------------- |
9021 | // Pre-construction: record ingredients whose recipes we'll need to further |
9022 | // process after constructing the initial VPlan. |
9023 | // --------------------------------------------------------------------------- |
9024 | |
9025 | // Mark instructions we'll need to sink later and their targets as |
9026 | // ingredients whose recipe we'll need to record. |
9027 | for (auto &Entry : SinkAfter) { |
9028 | RecipeBuilder.recordRecipeOf(Entry.first); |
9029 | RecipeBuilder.recordRecipeOf(Entry.second); |
9030 | } |
9031 | for (auto &Reduction : CM.getInLoopReductionChains()) { |
9032 | PHINode *Phi = Reduction.first; |
9033 | RecurKind Kind = |
9034 | Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); |
9035 | const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; |
9036 | |
9037 | RecipeBuilder.recordRecipeOf(Phi); |
9038 | for (auto &R : ReductionOperations) { |
9039 | RecipeBuilder.recordRecipeOf(R); |
9040 | // For min/max reductions, where we have a pair of icmp/select, we also
9041 | // need to record the ICmp recipe, so it can be removed later. |
9042 | assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9043 | "Only min/max recurrences allowed for inloop reductions");
9044 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) |
9045 | RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); |
9046 | } |
9047 | } |
9048 | |
9049 | // For each interleave group which is relevant for this (possibly trimmed) |
9050 | // Range, add it to the set of groups to be later applied to the VPlan and add |
9051 | // placeholders for its members' Recipes which we'll be replacing with a |
9052 | // single VPInterleaveRecipe. |
9053 | for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { |
9054 | auto applyIG = [IG, this](ElementCount VF) -> bool { |
9055 | return (VF.isVector() && // Query is illegal for VF == 1 |
9056 | CM.getWideningDecision(IG->getInsertPos(), VF) == |
9057 | LoopVectorizationCostModel::CM_Interleave); |
9058 | }; |
9059 | if (!getDecisionAndClampRange(applyIG, Range)) |
9060 | continue; |
9061 | InterleaveGroups.insert(IG); |
9062 | for (unsigned i = 0; i < IG->getFactor(); i++) |
9063 | if (Instruction *Member = IG->getMember(i)) |
9064 | RecipeBuilder.recordRecipeOf(Member); |
9065 | }
9066 | |
9067 | // --------------------------------------------------------------------------- |
9068 | // Build initial VPlan: Scan the body of the loop in a topological order to |
9069 | // visit each basic block after having visited its predecessor basic blocks. |
9070 | // --------------------------------------------------------------------------- |
9071 | |
9072 | // Create initial VPlan skeleton, with separate header and latch blocks. |
9073 | VPBasicBlock *HeaderVPBB = new VPBasicBlock(); |
9074 | VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); |
9075 | VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); |
9076 | auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); |
9077 | auto Plan = std::make_unique<VPlan>(TopRegion); |
9078 | |
9079 | Instruction *DLInst = |
9080 | getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); |
9081 | addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), |
9082 | DLInst ? DLInst->getDebugLoc() : DebugLoc(), |
9083 | !CM.foldTailByMasking(), false); |
9084 | |
9085 | // Scan the body of the loop in a topological order to visit each basic block |
9086 | // after having visited its predecessor basic blocks. |
9087 | LoopBlocksDFS DFS(OrigLoop); |
9088 | DFS.perform(LI); |
9089 | |
9090 | VPBasicBlock *VPBB = HeaderVPBB; |
9091 | SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; |
9092 | for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { |
9093 | // Relevant instructions from basic block BB will be grouped into VPRecipe |
9094 | // ingredients and fill a new VPBasicBlock. |
9095 | unsigned VPBBsForBB = 0; |
9096 | VPBB->setName(BB->getName()); |
9097 | Builder.setInsertPoint(VPBB); |
9098 | |
9099 | // Introduce each ingredient into VPlan. |
9100 | // TODO: Model and preserve debug intrinsics in VPlan.
9101 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
9102 | Instruction *Instr = &I; |
9103 | |
9104 | // First filter out irrelevant instructions, to ensure no recipes are |
9105 | // built for them. |
9106 | if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) |
9107 | continue; |
9108 | |
9109 | SmallVector<VPValue *, 4> Operands; |
9110 | auto *Phi = dyn_cast<PHINode>(Instr); |
9111 | if (Phi && Phi->getParent() == OrigLoop->getHeader()) { |
9112 | Operands.push_back(Plan->getOrAddVPValue( |
9113 | Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); |
9114 | } else { |
9115 | auto OpRange = Plan->mapToVPValues(Instr->operands()); |
9116 | Operands = {OpRange.begin(), OpRange.end()}; |
9117 | } |
9118 | if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( |
9119 | Instr, Operands, Range, Plan)) { |
9120 | // If Instr can be simplified to an existing VPValue, use it. |
9121 | if (RecipeOrValue.is<VPValue *>()) { |
9122 | auto *VPV = RecipeOrValue.get<VPValue *>(); |
9123 | Plan->addVPValue(Instr, VPV); |
9124 | // If the re-used value is a recipe, register the recipe for the |
9125 | // instruction, in case the recipe for Instr needs to be recorded. |
9126 | if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) |
9127 | RecipeBuilder.setRecipe(Instr, R); |
9128 | continue; |
9129 | } |
9130 | // Otherwise, add the new recipe. |
9131 | VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); |
9132 | for (auto *Def : Recipe->definedValues()) { |
9133 | auto *UV = Def->getUnderlyingValue(); |
9134 | Plan->addVPValue(UV, Def); |
9135 | } |
9136 | |
9137 | if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && |
9138 | HeaderVPBB->getFirstNonPhi() != VPBB->end()) { |
9139 | // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section |
9140 | // of the header block. That can happen for truncates of induction |
9141 | // variables. Those recipes are moved to the phi section of the header |
9142 | // block after applying SinkAfter, which relies on the original |
9143 | // position of the trunc. |
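     | // E.g. "%t = trunc i64 %iv to i32" (names illustrative) is widened as a
     | // truncated induction recipe; it is created here at the trunc's position
     | // and only moved up among the header phis later.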
9144 | assert(isa<TruncInst>(Instr));
9145 | InductionsToMove.push_back( |
9146 | cast<VPWidenIntOrFpInductionRecipe>(Recipe)); |
9147 | } |
9148 | RecipeBuilder.setRecipe(Instr, Recipe); |
9149 | VPBB->appendRecipe(Recipe); |
9150 | continue; |
9151 | } |
9152 | |
9153 | // Otherwise, if all widening options failed, the instruction is to be
9154 | // replicated. This may create a successor for VPBB.
9155 | VPBasicBlock *NextVPBB = |
9156 | RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); |
9157 | if (NextVPBB != VPBB) { |
9158 | VPBB = NextVPBB; |
9159 | VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) |
9160 | : ""); |
9161 | } |
9162 | } |
9163 | |
9164 | VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); |
9165 | VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); |
9166 | } |
9167 | |
9168 | // Fold the last, empty block into its predecessor. |
9169 | VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); |
9170 | assert(VPBB && "expected to fold last (empty) block");
9171 | // After here, VPBB should not be used. |
9172 | VPBB = nullptr; |
9173 | |
9174 | assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9175 | !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9176 | "entry block must be set to a VPRegionBlock having a non-empty entry "
9177 | "VPBasicBlock");
9178 | RecipeBuilder.fixHeaderPhis(); |
9179 | |
9180 | // --------------------------------------------------------------------------- |
9181 | // Transform initial VPlan: Apply previously taken decisions, in order, to |
9182 | // bring the VPlan to its final state. |
9183 | // --------------------------------------------------------------------------- |
9184 | |
9185 | // Apply Sink-After legal constraints. |
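     | // Conceptually, each SinkAfter entry (Sink, Target) requests that Sink's
     | // recipe be re-ordered to execute right after Target's recipe, e.g. so
     | // the previous-iteration value of a first-order recurrence is available
     | // before its use.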
9186 | auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { |
9187 | auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); |
9188 | if (Region && Region->isReplicator()) { |
9189 | assert(Region->getNumSuccessors() == 1 &&
9190 | Region->getNumPredecessors() == 1 && "Expected SESE region!");
9191 | assert(R->getParent()->size() == 1 &&
9192 | "A recipe in an original replicator region must be the only "
9193 | "recipe in its block");
9194 | return Region; |
9195 | } |
9196 | return nullptr; |
9197 | }; |
9198 | for (auto &Entry : SinkAfter) { |
9199 | VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); |
9200 | VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); |
9201 | |
9202 | auto *TargetRegion = GetReplicateRegion(Target); |
9203 | auto *SinkRegion = GetReplicateRegion(Sink); |
9204 | if (!SinkRegion) { |
9205 | // If the sink source is not a replicate region, sink the recipe directly. |
9206 | if (TargetRegion) { |
9207 | // The target is in a replication region, make sure to move Sink to |
9208 | // the block after it, not into the replication region itself. |
9209 | VPBasicBlock *NextBlock = |
9210 | cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); |
9211 | Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); |
9212 | } else |
9213 | Sink->moveAfter(Target); |
9214 | continue; |
9215 | } |
9216 | |
9217 | // The sink source is in a replicate region. Unhook the region from the CFG. |
9218 | auto *SinkPred = SinkRegion->getSinglePredecessor(); |
9219 | auto *SinkSucc = SinkRegion->getSingleSuccessor(); |
9220 | VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); |
9221 | VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); |
9222 | VPBlockUtils::connectBlocks(SinkPred, SinkSucc); |
9223 | |
9224 | if (TargetRegion) { |
9225 | // The target recipe is also in a replicate region, move the sink region |
9226 | // after the target region. |
9227 | auto *TargetSucc = TargetRegion->getSingleSuccessor(); |
9228 | VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); |
9229 | VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); |
9230 | VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); |
9231 | } else { |
9232 | // The sink source is in a replicate region; we need to move the whole
9233 | // replicate region, which should only contain a single recipe in the
9234 | // main block.
9235 | auto *SplitBlock = |
9236 | Target->getParent()->splitAt(std::next(Target->getIterator())); |
9237 | |
9238 | auto *SplitPred = SplitBlock->getSinglePredecessor(); |
9239 | |
9240 | VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); |
9241 | VPBlockUtils::connectBlocks(SplitPred, SinkRegion); |
9242 | VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); |
9243 | } |
9244 | } |
9245 | |
9246 | VPlanTransforms::removeRedundantCanonicalIVs(*Plan); |
9247 | VPlanTransforms::removeRedundantInductionCasts(*Plan); |
9248 | |
9249 | // Now that sink-after is done, move induction recipes for optimized truncates |
9250 | // to the phi section of the header block. |
9251 | for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) |
9252 | Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); |
9253 | |
9254 | // Adjust the recipes for any inloop reductions. |
9255 | adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, |
9256 | RecipeBuilder, Range.Start); |
9257 | |
9258 | // Introduce a recipe to combine the incoming and previous values of a |
9259 | // first-order recurrence. |
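     | // E.g. with VF = 4, splicing the phi <p0,p1,p2,p3> with its backedge
     | // value <b0,b1,b2,b3> yields <p3,b0,b1,b2>: each lane sees the value
     | // produced by the previous iteration.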
9260 | for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { |
9261 | auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); |
9262 | if (!RecurPhi) |
9263 | continue; |
9264 | |
9265 | VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); |
9266 | VPBasicBlock *InsertBlock = PrevRecipe->getParent(); |
9267 | auto *Region = GetReplicateRegion(PrevRecipe); |
9268 | if (Region) |
9269 | InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); |
9270 | if (Region || PrevRecipe->isPhi()) |
9271 | Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); |
9272 | else |
9273 | Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); |
9274 | |
9275 | auto *RecurSplice = cast<VPInstruction>( |
9276 | Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, |
9277 | {RecurPhi, RecurPhi->getBackedgeValue()})); |
9278 | |
9279 | RecurPhi->replaceAllUsesWith(RecurSplice); |
9280 | // Set the first operand of RecurSplice to RecurPhi again, after replacing |
9281 | // all users. |
9282 | RecurSplice->setOperand(0, RecurPhi); |
9283 | } |
9284 | |
9285 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
9286 | // for this VPlan, replace the Recipes widening its memory instructions with a |
9287 | // single VPInterleaveRecipe at its insertion point. |
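     | // E.g. for a factor-2 group over A[2*i] and A[2*i+1], the two widened
     | // load recipes are replaced by one VPInterleaveRecipe that performs a
     | // single wide load and de-interleaves the even and odd elements.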
9288 | for (auto IG : InterleaveGroups) { |
9289 | auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( |
9290 | RecipeBuilder.getRecipe(IG->getInsertPos())); |
9291 | SmallVector<VPValue *, 4> StoredValues; |
9292 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
9293 | if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { |
9294 | auto *StoreR = |
9295 | cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); |
9296 | StoredValues.push_back(StoreR->getStoredValue()); |
9297 | } |
9298 | |
9299 | auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, |
9300 | Recipe->getMask()); |
9301 | VPIG->insertBefore(Recipe); |
9302 | unsigned J = 0; |
9303 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
9304 | if (Instruction *Member = IG->getMember(i)) { |
9305 | if (!Member->getType()->isVoidTy()) { |
9306 | VPValue *OriginalV = Plan->getVPValue(Member); |
9307 | Plan->removeVPValueFor(Member); |
9308 | Plan->addVPValue(Member, VPIG->getVPValue(J)); |
9309 | OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); |
9310 | J++; |
9311 | } |
9312 | RecipeBuilder.getRecipe(Member)->eraseFromParent(); |
9313 | } |
9314 | } |
9315 | |
9316 | // From this point onwards, VPlan-to-VPlan transformations may change the
9317 | // plan in ways that make accessing values via their original IR values
9318 | // incorrect.
9318 | Plan->disableValue2VPValue(); |
9319 | |
9320 | VPlanTransforms::sinkScalarOperands(*Plan); |
9321 | VPlanTransforms::mergeReplicateRegions(*Plan); |
9322 | |
9323 | std::string PlanName; |
9324 | raw_string_ostream RSO(PlanName); |
9325 | ElementCount VF = Range.Start; |
9326 | Plan->addVF(VF); |
9327 | RSO << "Initial VPlan for VF={" << VF; |
9328 | for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { |
9329 | Plan->addVF(VF); |
9330 | RSO << "," << VF; |
9331 | } |
9332 | RSO << "},UF>=1"; |
9333 | RSO.flush(); |
9334 | Plan->setName(PlanName); |
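     | // E.g. for Range = [4, 16) the VFs 4 and 8 are added and the plan is
     | // named "Initial VPlan for VF={4,8},UF>=1".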
9335 | |
9336 | // Fold Exit block into its predecessor if possible. |
9337 | // TODO: Fold block earlier once all VPlan transforms properly maintain a |
9338 | // VPBasicBlock as exit. |
9339 | VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); |
9340 | |
9341 | assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9342 | return Plan; |
9343 | } |
9344 | |
9345 | VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { |
9346 | // Outer loop handling: outer loops may require CFG and instruction-level
9347 | // transformations before even evaluating whether vectorization is profitable.
9348 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
9349 | // the vectorization pipeline. |
9350 | assert(!OrigLoop->isInnermost());
9351 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9352 | |
9353 | // Create new empty VPlan |
9354 | auto Plan = std::make_unique<VPlan>(); |
9355 | |
9356 | // Build hierarchical CFG |
9357 | VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); |
9358 | HCFGBuilder.buildHierarchicalCFG(); |
9359 | |
9360 | for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); |
9361 | VF *= 2) |
9362 | Plan->addVF(VF); |
9363 | |
9364 | if (EnableVPlanPredication) { |
9365 | VPlanPredicator VPP(*Plan); |
9366 | VPP.predicate(); |
9367 | |
9368 | // Avoid running transformation to recipes until masked code generation in |
9369 | // VPlan-native path is in place. |
9370 | return Plan; |
9371 | } |
9372 | |
9373 | SmallPtrSet<Instruction *, 1> DeadInstructions; |
9374 | VPlanTransforms::VPInstructionsToVPRecipes( |
9375 | OrigLoop, Plan, |
9376 | [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, |
9377 | DeadInstructions, *PSE.getSE()); |
9378 | |
9379 | addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), |
9380 | true, true); |
9381 | return Plan; |
9382 | } |
9383 | |
9384 | // Adjust the recipes for reductions. For in-loop reductions the chain of
9385 | // instructions leading from the loop exit instr to the phi needs to be
9386 | // converted to reductions, with one operand being vector and the other being
9387 | // the scalar reduction chain. For other reductions, a select is introduced
9388 | // between the phi and live-out recipes when folding the tail.
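     | // E.g. for an in-loop reduction "sum += a[i]" the widened fadd is
     | // replaced by a VPReductionRecipe whose operands are the scalar chain
     | // (sum) and the widened vector of a[i] values.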
9389 | void LoopVectorizationPlanner::adjustRecipesForReductions( |
9390 | VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, |
9391 | ElementCount MinVF) { |
9392 | for (auto &Reduction : CM.getInLoopReductionChains()) { |
9393 | PHINode *Phi = Reduction.first; |
9394 | const RecurrenceDescriptor &RdxDesc = |
9395 | Legal->getReductionVars().find(Phi)->second; |
9396 | const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; |
9397 | |
9398 | if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) |
9399 | continue; |
9400 | |
9401 | // ReductionOperations are ordered top-down from the phi's use to the
9402 | // LoopExitValue. We keep track of the previous item (the Chain) to tell
9403 | // which of the two operands will remain scalar and which will be reduced.
9404 | // For minmax the chain will be the select instructions.
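     | // E.g. an smax reduction appears as (names illustrative):
     | //   %c = icmp sgt i32 %max, %v
     | //   %max.next = select i1 %c, i32 %max, i32 %v
     | // and the chain advances through the select, %max -> %max.next.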
9405 | Instruction *Chain = Phi; |
9406 | for (Instruction *R : ReductionOperations) { |
9407 | VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); |
9408 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
9409 | |
9410 | VPValue *ChainOp = Plan->getVPValue(Chain); |
9411 | unsigned FirstOpId; |
9412 | assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9413 | "Only min/max recurrences allowed for inloop reductions");
9414 | // Recognize a call to the llvm.fmuladd intrinsic. |
9415 | bool IsFMulAdd = (Kind == RecurKind::FMulAdd); |
9416 | assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9417 | "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9418 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9419 | assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9420 | "Expected to replace a VPWidenSelectSC");
9421 | FirstOpId = 1; |
9422 | } else { |
9423 | assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9424 | (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9425 | "Expected to replace a VPWidenSC");
9426 | FirstOpId = 0; |
9427 | } |
9428 | unsigned VecOpId = |
9429 | R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; |
9430 | VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); |
9431 | |
9432 | auto *CondOp = CM.foldTailByMasking() |
9433 | ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) |
9434 | : nullptr; |
9435 | |
9436 | if (IsFMulAdd) { |
9437 | // If the instruction is a call to the llvm.fmuladd intrinsic then we |
9438 | // need to create an fmul recipe to use as the vector operand for the |
9439 | // fadd reduction. |
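     | // E.g. "%s = call float @llvm.fmuladd.f32(%a, %b, %sum)" is split into
     | // an fmul of %a and %b plus an fadd reduction of that product into the
     | // chain %sum (names illustrative).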
9440 | VPInstruction *FMulRecipe = new VPInstruction( |
9441 | Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); |
9442 | FMulRecipe->setFastMathFlags(R->getFastMathFlags()); |
9443 | WidenRecipe->getParent()->insert(FMulRecipe, |
9444 | WidenRecipe->getIterator()); |
9445 | VecOp = FMulRecipe; |
9446 | } |
9447 | VPReductionRecipe *RedRecipe =
9448 | new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9449 | Plan->removeVPValueFor(R);
9450 | Plan->addVPValue(R, RedRecipe);
9451 | WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9452 | // Replace all uses of the widened recipe's value with the new reduction.
9453 | WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9454 | WidenRecipe->eraseFromParent();
9455 | |
9456 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9457 | VPRecipeBase *CompareRecipe = |
9458 | RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); |
9459 | assert(isa<VPWidenRecipe>(CompareRecipe) &&
9460 | "Expected to replace a VPWidenSC");
9461 | assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9462 | "Expected no remaining users");
9463 | CompareRecipe->eraseFromParent(); |
9464 | } |
9465 | Chain = R; |
9466 | } |
9467 | } |
9468 | |
9469 | // If tail is folded by masking, introduce selects between the phi |
9470 | // and the live-out instruction of each reduction, at the beginning of the |
9471 | // dedicated latch block. |
9472 | if (CM.foldTailByMasking()) { |
9473 | Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); |
9474 | for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { |
9475 | VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); |
9476 | if (!PhiR || PhiR->isInLoop()) |
9477 | continue; |
9478 | VPValue *Cond = |
9479 | RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); |
9480 | VPValue *Red = PhiR->getBackedgeValue(); |
9481 | assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9482 | "reduction recipe must be defined before latch");
9483 | Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); |
9484 | } |
9485 | } |
9486 | } |
9487 | |
9488 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
9489 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
9490 | VPSlotTracker &SlotTracker) const { |
9491 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; |
9492 | IG->getInsertPos()->printAsOperand(O, false); |
9493 | O << ", "; |
9494 | getAddr()->printAsOperand(O, SlotTracker); |
9495 | VPValue *Mask = getMask(); |
9496 | if (Mask) { |
9497 | O << ", "; |
9498 | Mask->printAsOperand(O, SlotTracker); |
9499 | } |
9500 | |
9501 | unsigned OpIdx = 0; |
9502 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
9503 | if (!IG->getMember(i)) |
9504 | continue; |
9505 | if (getNumStoreOperands() > 0) { |
9506 | O << "\n" << Indent << " store "; |
9507 | getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); |
9508 | O << " to index " << i; |
9509 | } else { |
9510 | O << "\n" << Indent << " "; |
9511 | getVPValue(OpIdx)->printAsOperand(O, SlotTracker); |
9512 | O << " = load from index " << i; |
9513 | } |
9514 | ++OpIdx; |
9515 | } |
9516 | } |
9517 | #endif |
9518 | |
9519 | void VPWidenCallRecipe::execute(VPTransformState &State) { |
9520 | State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, |
9521 | *this, State); |
9522 | } |
9523 | |
9524 | void VPWidenSelectRecipe::execute(VPTransformState &State) { |
9525 | auto &I = *cast<SelectInst>(getUnderlyingInstr()); |
9526 | State.ILV->setDebugLocFromInst(&I); |
9527 | |
9528 | // The condition can be loop invariant but still defined inside the |
9529 | // loop. This means that we can't just use the original 'cond' value. |
9530 | // We have to take the 'vectorized' value and pick the first lane. |
9531 | // Instcombine will make this a no-op. |
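     | // Conceptually this is "extractelement <VF x i1> %cond.vec, i32 0",
     | // i.e. lane 0 of part 0 of the widened condition.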
9532 | auto *InvarCond = |
9533 | InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; |
9534 | |
9535 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9536 | Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); |
9537 | Value *Op0 = State.get(getOperand(1), Part); |
9538 | Value *Op1 = State.get(getOperand(2), Part); |
9539 | Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); |
9540 | State.set(this, Sel, Part); |
9541 | State.ILV->addMetadata(Sel, &I); |
9542 | } |
9543 | } |
9544 | |
9545 | void VPWidenRecipe::execute(VPTransformState &State) { |
9546 | auto &I = *cast<Instruction>(getUnderlyingValue()); |
9547 | auto &Builder = State.Builder; |
9548 | switch (I.getOpcode()) { |
9549 | case Instruction::Call: |
9550 | case Instruction::Br: |
9551 | case Instruction::PHI: |
9552 | case Instruction::GetElementPtr: |
9553 | case Instruction::Select: |
9554 | llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe." , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9554); |
9555 | case Instruction::UDiv: |
9556 | case Instruction::SDiv: |
9557 | case Instruction::SRem: |
9558 | case Instruction::URem: |
9559 | case Instruction::Add: |
9560 | case Instruction::FAdd: |
9561 | case Instruction::Sub: |
9562 | case Instruction::FSub: |
9563 | case Instruction::FNeg: |
9564 | case Instruction::Mul: |
9565 | case Instruction::FMul: |
9566 | case Instruction::FDiv: |
9567 | case Instruction::FRem: |
9568 | case Instruction::Shl: |
9569 | case Instruction::LShr: |
9570 | case Instruction::AShr: |
9571 | case Instruction::And: |
9572 | case Instruction::Or: |
9573 | case Instruction::Xor: { |
9574 | // Just widen unops and binops. |
9575 | State.ILV->setDebugLocFromInst(&I); |
9576 | |
9577 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9578 | SmallVector<Value *, 2> Ops; |
9579 | for (VPValue *VPOp : operands()) |
9580 | Ops.push_back(State.get(VPOp, Part)); |
9581 | |
9582 | Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); |
9583 | |
9584 | if (auto *VecOp = dyn_cast<Instruction>(V)) { |
9585 | VecOp->copyIRFlags(&I); |
9586 | |
9587 | // If the instruction is vectorized and was in a basic block that needed |
9588 | // predication, we can't propagate poison-generating flags (nuw/nsw, |
9589 | // exact, etc.). The control flow has been linearized and the |
9590 | // instruction is no longer guarded by the predicate, so the flag
9591 | // properties may no longer hold.
9592 | if (State.MayGeneratePoisonRecipes.contains(this)) |
9593 | VecOp->dropPoisonGeneratingFlags(); |
9594 | } |
9595 | |
9596 | // Use this vector value for all users of the original instruction. |
9597 | State.set(this, V, Part); |
9598 | State.ILV->addMetadata(V, &I); |
9599 | } |
9600 | |
9601 | break; |
9602 | } |
9603 | case Instruction::ICmp: |
9604 | case Instruction::FCmp: { |
9605 | // Widen compares. Generate vector compares. |
9606 | bool FCmp = (I.getOpcode() == Instruction::FCmp); |
9607 | auto *Cmp = cast<CmpInst>(&I); |
9608 | State.ILV->setDebugLocFromInst(Cmp); |
9609 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9610 | Value *A = State.get(getOperand(0), Part); |
9611 | Value *B = State.get(getOperand(1), Part); |
9612 | Value *C = nullptr; |
9613 | if (FCmp) { |
9614 | // Propagate fast math flags. |
9615 | IRBuilder<>::FastMathFlagGuard FMFG(Builder); |
9616 | Builder.setFastMathFlags(Cmp->getFastMathFlags()); |
9617 | C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); |
9618 | } else { |
9619 | C = Builder.CreateICmp(Cmp->getPredicate(), A, B); |
9620 | } |
9621 | State.set(this, C, Part); |
9622 | State.ILV->addMetadata(C, &I); |
9623 | } |
9624 | |
9625 | break; |
9626 | } |
9627 | |
9628 | case Instruction::ZExt: |
9629 | case Instruction::SExt: |
9630 | case Instruction::FPToUI: |
9631 | case Instruction::FPToSI: |
9632 | case Instruction::FPExt: |
9633 | case Instruction::PtrToInt: |
9634 | case Instruction::IntToPtr: |
9635 | case Instruction::SIToFP: |
9636 | case Instruction::UIToFP: |
9637 | case Instruction::Trunc: |
9638 | case Instruction::FPTrunc: |
9639 | case Instruction::BitCast: { |
9640 | auto *CI = cast<CastInst>(&I); |
9641 | State.ILV->setDebugLocFromInst(CI); |
9642 | |
9643 | // Vectorize casts.
9644 | Type *DestTy = (State.VF.isScalar()) |
9645 | ? CI->getType() |
9646 | : VectorType::get(CI->getType(), State.VF); |
9647 | |
9648 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9649 | Value *A = State.get(getOperand(0), Part); |
9650 | Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); |
9651 | State.set(this, Cast, Part); |
9652 | State.ILV->addMetadata(Cast, &I); |
9653 | } |
9654 | break; |
9655 | } |
9656 | default: |
9657 | // This instruction is not vectorized by simple widening. |
9658 | LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9659 | llvm_unreachable("Unhandled instruction!");
9660 | } // end of switch. |
9661 | } |
9662 | |
9663 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
9664 | auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); |
9665 | // Construct a vector GEP by widening the operands of the scalar GEP as |
9666 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
9667 | // results in a vector of pointers when at least one operand of the GEP |
9668 | // is vector-typed. Thus, to keep the representation compact, we only use |
9669 | // vector-typed operands for loop-varying values. |
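     | // E.g. for "gep %base, %iv" only the loop-varying index %iv is widened;
     | // %base stays scalar and the single GEP yields a vector of pointers.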
9670 | |
9671 | if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { |
9672 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
9673 | // the GEP we build (by only using vector-typed operands for |
9674 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
9675 | // produce a vector of pointers, we need to either arbitrarily pick an |
9676 | // operand to broadcast, or broadcast a clone of the original GEP. |
9677 | // Here, we broadcast a clone of the original. |
9678 | // |
9679 | // TODO: If at some point we decide to scalarize instructions having |
9680 | // loop-invariant operands, this special case will no longer be |
9681 | // required. We would add the scalarization decision to |
9682 | // collectLoopScalars() and teach getVectorValue() to broadcast |
9683 | // the lane-zero scalar value. |
9684 | auto *Clone = State.Builder.Insert(GEP->clone()); |
9685 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9686 | Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); |
9687 | State.set(this, EntryPart, Part); |
9688 | State.ILV->addMetadata(EntryPart, GEP); |
9689 | } |
9690 | } else { |
9691 | // If the GEP has at least one loop-varying operand, we are sure to |
9692 | // produce a vector of pointers. But if we are only unrolling, we want |
9693 | // to produce a scalar GEP for each unroll part. Thus, the GEP we |
9694 | // produce with the code below will be scalar (if VF == 1) or vector |
9695 | // (otherwise). Note that for the unroll-only case, we still maintain |
9696 | // values in the vector mapping with initVector, as we do for other |
9697 | // instructions. |
9698 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9699 | // The pointer operand of the new GEP. If it's loop-invariant, we |
9700 | // won't broadcast it. |
9701 | auto *Ptr = IsPtrLoopInvariant |
9702 | ? State.get(getOperand(0), VPIteration(0, 0)) |
9703 | : State.get(getOperand(0), Part); |
9704 | |
9705 | // Collect all the indices for the new GEP. If any index is |
9706 | // loop-invariant, we won't broadcast it. |
9707 | SmallVector<Value *, 4> Indices; |
9708 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
9709 | VPValue *Operand = getOperand(I); |
9710 | if (IsIndexLoopInvariant[I - 1]) |
9711 | Indices.push_back(State.get(Operand, VPIteration(0, 0))); |
9712 | else |
9713 | Indices.push_back(State.get(Operand, Part)); |
9714 | } |
9715 | |
9716 | // If the GEP instruction is vectorized and was in a basic block that
9717 | // needed predication, we can't propagate the poison-generating 'inbounds'
9718 | // flag. The control flow has been linearized and the GEP is no longer
9719 | // guarded by the predicate, so the 'inbounds' property may no longer
9720 | // hold.
9721 | bool IsInBounds = |
9722 | GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; |
9723 | |
9724 | // Create the new GEP. Note that this GEP may be a scalar if VF == 1, |
9725 | // but it should be a vector, otherwise. |
9726 | auto *NewGEP = IsInBounds |
9727 | ? State.Builder.CreateInBoundsGEP( |
9728 | GEP->getSourceElementType(), Ptr, Indices) |
9729 | : State.Builder.CreateGEP(GEP->getSourceElementType(), |
9730 | Ptr, Indices); |
9731 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9732 | "NewGEP is not a pointer vector");
9733 | State.set(this, NewGEP, Part); |
9734 | State.ILV->addMetadata(NewGEP, GEP); |
9735 | } |
9736 | } |
9737 | } |
9738 | |
9739 | void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { |
9740 | assert(!State.Instance && "Int or FP induction being replicated.")(static_cast <bool> (!State.Instance && "Int or FP induction being replicated." ) ? void (0) : __assert_fail ("!State.Instance && \"Int or FP induction being replicated.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9740, __extension__ __PRETTY_FUNCTION__)); |
9741 | auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); |
9742 | State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); |
9743 | } |
9744 | |
9745 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
9746 | State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, |
9747 | State); |
9748 | } |
9749 | |
9750 | void VPBlendRecipe::execute(VPTransformState &State) { |
9751 | State.ILV->setDebugLocFromInst(Phi, &State.Builder); |
9752 | // We know that all PHIs in non-header blocks are converted into |
9753 | // selects, so we don't have to worry about the insertion order and we |
9754 | // can just use the builder. |
9755 | // At this point we generate the predication tree. There may be |
9756 | // duplications since this is a simple recursive scan, but future |
9757 | // optimizations will clean it up. |
9758 | |
9759 | unsigned NumIncoming = getNumIncomingValues(); |
9760 | |
9761 | // Generate a sequence of selects of the form: |
9762 | // SELECT(Mask3, In3, |
9763 | // SELECT(Mask2, In2, |
9764 | // SELECT(Mask1, In1, |
9765 | // In0))) |
9766 | // Note that Mask0 is never used: lanes for which no path reaches this phi
9767 | // are essentially undef and take their value from In0.
9768 | InnerLoopVectorizer::VectorParts Entry(State.UF); |
9769 | for (unsigned In = 0; In < NumIncoming; ++In) { |
9770 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9771 | // We might have single edge PHIs (blocks) - use an identity |
9772 | // 'select' for the first PHI operand. |
9773 | Value *In0 = State.get(getIncomingValue(In), Part); |
9774 | if (In == 0) |
9775 | Entry[Part] = In0; // Initialize with the first incoming value. |
9776 | else { |
9777 | // Select between the current value and the previous incoming edge |
9778 | // based on the incoming mask. |
9779 | Value *Cond = State.get(getMask(In), Part); |
9780 | Entry[Part] = |
9781 | State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); |
9782 | } |
9783 | } |
9784 | } |
9785 | for (unsigned Part = 0; Part < State.UF; ++Part) |
9786 | State.set(this, Entry[Part], Part); |
9787 | } |
9788 | |
9789 | void VPInterleaveRecipe::execute(VPTransformState &State) { |
9790 | assert(!State.Instance && "Interleave group being replicated.")(static_cast <bool> (!State.Instance && "Interleave group being replicated." ) ? void (0) : __assert_fail ("!State.Instance && \"Interleave group being replicated.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9790, __extension__ __PRETTY_FUNCTION__)); |
9791 | State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), |
9792 | getStoredValues(), getMask()); |
9793 | } |
9794 | |
9795 | void VPReductionRecipe::execute(VPTransformState &State) { |
9796 | assert(!State.Instance && "Reduction being replicated.")(static_cast <bool> (!State.Instance && "Reduction being replicated." ) ? void (0) : __assert_fail ("!State.Instance && \"Reduction being replicated.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9796, __extension__ __PRETTY_FUNCTION__)); |
9797 | Value *PrevInChain = State.get(getChainOp(), 0); |
9798 | RecurKind Kind = RdxDesc->getRecurrenceKind(); |
9799 | bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); |
9800 | // Propagate the fast-math flags carried by the underlying instruction. |
9801 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
9802 | State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); |
9803 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
9804 | Value *NewVecOp = State.get(getVecOp(), Part); |
9805 | if (VPValue *Cond = getCondOp()) { |
9806 | Value *NewCond = State.get(Cond, Part); |
9807 | VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); |
9808 | Value *Iden = RdxDesc->getRecurrenceIdentity( |
9809 | Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); |
9810 | Value *IdenVec = |
9811 | State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); |
9812 | Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); |
9813 | NewVecOp = Select; |
9814 | } |
9815 | Value *NewRed; |
9816 | Value *NextInChain; |
9817 | if (IsOrdered) { |
9818 | if (State.VF.isVector()) |
9819 | NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, |
9820 | PrevInChain); |
9821 | else |
9822 | NewRed = State.Builder.CreateBinOp( |
9823 | (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, |
9824 | NewVecOp); |
9825 | PrevInChain = NewRed; |
9826 | } else { |
9827 | PrevInChain = State.get(getChainOp(), Part); |
9828 | NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); |
9829 | } |
9830 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9831 | NextInChain = |
9832 | createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), |
9833 | NewRed, PrevInChain); |
9834 | } else if (IsOrdered) |
9835 | NextInChain = NewRed; |
9836 | else |
9837 | NextInChain = State.Builder.CreateBinOp( |
9838 | (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, |
9839 | PrevInChain); |
9840 | State.set(this, NextInChain, Part); |
9841 | } |
9842 | } |
9843 | |
9844 | void VPReplicateRecipe::execute(VPTransformState &State) { |
9845 | if (State.Instance) { // Generate a single instance. |
9846 | assert(!State.VF.isScalable() && "Can't scalarize a scalable vector")(static_cast <bool> (!State.VF.isScalable() && "Can't scalarize a scalable vector" ) ? void (0) : __assert_fail ("!State.VF.isScalable() && \"Can't scalarize a scalable vector\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9846, __extension__ __PRETTY_FUNCTION__)); |
9847 | State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, |
9848 | IsPredicated, State); |
9849 | // Insert scalar instance packing it into a vector. |
9850 | if (AlsoPack && State.VF.isVector()) { |
9851 | // If we're constructing lane 0, initialize to start from poison. |
9852 | if (State.Instance->Lane.isFirstLane()) { |
9853 | assert(!State.VF.isScalable() && "VF is assumed to be non scalable.")(static_cast <bool> (!State.VF.isScalable() && "VF is assumed to be non scalable." ) ? void (0) : __assert_fail ("!State.VF.isScalable() && \"VF is assumed to be non scalable.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9853, __extension__ __PRETTY_FUNCTION__)); |
9854 | Value *Poison = PoisonValue::get( |
9855 | VectorType::get(getUnderlyingValue()->getType(), State.VF)); |
9856 | State.set(this, Poison, State.Instance->Part); |
9857 | } |
9858 | State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); |
9859 | } |
9860 | return; |
9861 | } |
9862 | |
9863 | // Generate scalar instances for all VF lanes of all UF parts, unless the
9864 | // instruction is uniform, in which case generate only the first lane for
9865 | // each of the UF parts.
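     | // E.g. with UF = 2 and VF = 4, a non-uniform instruction gets 8 scalar
     | // copies (2 parts x 4 lanes), a uniform one only 2 (lane 0 per part).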
9866 | unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); |
9867 | assert((!State.VF.isScalable() || IsUniform) &&
9868 | "Can't scalarize a scalable vector");
9869 | for (unsigned Part = 0; Part < State.UF; ++Part) |
9870 | for (unsigned Lane = 0; Lane < EndLane; ++Lane) |
9871 | State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, |
9872 | VPIteration(Part, Lane), IsPredicated, |
9873 | State); |
9874 | } |
9875 | |
9876 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
9877 | assert(State.Instance && "Branch on Mask works only on single instance.")(static_cast <bool> (State.Instance && "Branch on Mask works only on single instance." ) ? void (0) : __assert_fail ("State.Instance && \"Branch on Mask works only on single instance.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9877, __extension__ __PRETTY_FUNCTION__)); |
9878 | |
9879 | unsigned Part = State.Instance->Part; |
9880 | unsigned Lane = State.Instance->Lane.getKnownLane(); |
9881 | |
9882 | Value *ConditionBit = nullptr; |
9883 | VPValue *BlockInMask = getMask(); |
9884 | if (BlockInMask) { |
9885 | ConditionBit = State.get(BlockInMask, Part); |
9886 | if (ConditionBit->getType()->isVectorTy()) |
9887 | ConditionBit = State.Builder.CreateExtractElement( |
9888 | ConditionBit, State.Builder.getInt32(Lane)); |
9889 | } else // Block in mask is all-one. |
9890 | ConditionBit = State.Builder.getTrue(); |
9891 | |
9892 | // Replace the temporary unreachable terminator with a new conditional branch, |
9893 | // whose two destinations will be set later when they are created. |
9894 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
9895 | assert(isa<UnreachableInst>(CurrentTerminator) &&
9896 | "Expected to replace unreachable terminator with conditional branch.");
9897 | auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); |
9898 | CondBr->setSuccessor(0, nullptr); |
9899 | ReplaceInstWithInst(CurrentTerminator, CondBr); |
9900 | } |
9901 | |
9902 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
9903 | assert(State.Instance && "Predicated instruction PHI works per instance.")(static_cast <bool> (State.Instance && "Predicated instruction PHI works per instance." ) ? void (0) : __assert_fail ("State.Instance && \"Predicated instruction PHI works per instance.\"" , "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 9903, __extension__ __PRETTY_FUNCTION__)); |
9904 | Instruction *ScalarPredInst = |
9905 | cast<Instruction>(State.get(getOperand(0), *State.Instance)); |
9906 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
9907 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
9908 | assert(PredicatingBB && "Predicated block has no single predecessor.");
9909 | assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9910 | "operand must be VPReplicateRecipe");
9911 | |
9912 | // By current pack/unpack logic we need to generate only a single phi node: if |
9913 | // a vector value for the predicated instruction exists at this point it means |
9914 | // the instruction has vector users only, and a phi for the vector value is |
9915 | // needed. In this case the recipe of the predicated instruction is marked to |
9916 | // also do that packing, thereby "hoisting" the insert-element sequence. |
9917 | // Otherwise, a phi node for the scalar value is needed. |
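     | // E.g. if the predicated load was packed into a vector, the phi merges
     | // the unmodified vector (from the predicating block) with the vector
     | // holding the newly inserted element (from the predicated block).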
9918 | unsigned Part = State.Instance->Part; |
9919 | if (State.hasVectorValue(getOperand(0), Part)) { |
9920 | Value *VectorValue = State.get(getOperand(0), Part); |
9921 | InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); |
9922 | PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); |
9923 | VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. |
9924 | VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. |
9925 | if (State.hasVectorValue(this, Part)) |
9926 | State.reset(this, VPhi, Part); |
9927 | else |
9928 | State.set(this, VPhi, Part); |
9929 | // NOTE: Currently we need to update the value of the operand, so the next |
9930 | // predicated iteration inserts its generated value in the correct vector. |
9931 | State.reset(getOperand(0), VPhi, Part); |
9932 | } else { |
9933 | Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); |
9934 | PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); |
9935 | Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), |
9936 | PredicatingBB); |
9937 | Phi->addIncoming(ScalarPredInst, PredicatedBB); |
9938 | if (State.hasScalarValue(this, *State.Instance)) |
9939 | State.reset(this, Phi, *State.Instance); |
9940 | else |
9941 | State.set(this, Phi, *State.Instance); |
9942 | // NOTE: Currently we need to update the value of the operand, so the next |
9943 | // predicated iteration inserts its generated value in the correct vector. |
9944 | State.reset(getOperand(0), Phi, *State.Instance); |
9945 | } |
9946 | } |
9947 | |
9948 | void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { |
9949 | VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; |
9950 | |
9951 | // Attempt to issue a wide load. |
9952 | LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); |
9953 | StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); |
9954 | |
9955 | assert((LI || SI) && "Invalid Load/Store instruction");
9956 | assert((!SI || StoredValue) && "No stored value provided for widened store");
9957 | assert((!LI || !StoredValue) && "Stored value provided for widened load");
9958 | |
9959 | Type *ScalarDataTy = getLoadStoreType(&Ingredient); |
9960 | |
9961 | auto *DataTy = VectorType::get(ScalarDataTy, State.VF); |
9962 | const Align Alignment = getLoadStoreAlignment(&Ingredient); |
9963 | bool CreateGatherScatter = !Consecutive; |
9964 | |
9965 | auto &Builder = State.Builder; |
9966 | InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); |
9967 | bool isMaskRequired = getMask(); |
9968 | if (isMaskRequired) |
9969 | for (unsigned Part = 0; Part < State.UF; ++Part) |
9970 | BlockInMaskParts[Part] = State.get(getMask(), Part); |
9971 | |
9972 | const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { |
9973 | // Calculate the pointer for the specific unroll-part. |
9974 | GetElementPtrInst *PartPtr = nullptr; |
9975 | |
9976 | bool InBounds = false; |
9977 | if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) |
9978 | InBounds = gep->isInBounds(); |
9979 | if (Reverse) { |
9980 | // If the address is consecutive but reversed, then the |
9981 | // wide store needs to start at the last vector element. |
9982 | // RunTimeVF = VScale * VF.getKnownMinValue() |
9983 | // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9984 | Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); |
9985 | // NumElt = -Part * RunTimeVF |
9986 | Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); |
9987 | // LastLane = 1 - RunTimeVF |
9988 | Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); |
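     | // E.g. for fixed-width VF = 4 and Part == 1: RunTimeVF = 4, NumElt = -4
     | // and LastLane = -3, so the wide access covers Ptr[-7] .. Ptr[-4] and is
     | // reversed afterwards.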
9989 | PartPtr = |
9990 | cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); |
9991 | PartPtr->setIsInBounds(InBounds); |
9992 | PartPtr = cast<GetElementPtrInst>( |
9993 | Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); |
9994 | PartPtr->setIsInBounds(InBounds); |
9995 | if (isMaskRequired) // Reverse of a null all-one mask is a null mask. |
9996 | BlockInMaskParts[Part] = |
9997 | Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); |
9998 | } else { |
9999 | Value *Increment = |
10000 | createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); |
10001 | PartPtr = cast<GetElementPtrInst>( |
10002 | Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); |
10003 | PartPtr->setIsInBounds(InBounds); |
10004 | } |
10005 | |
10006 | unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); |
10007 | return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); |
10008 | }; |
10009 | |
10010 | // Handle Stores: |
10011 | if (SI) { |
10012 | State.ILV->setDebugLocFromInst(SI); |
10013 | |
10014 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
10015 | Instruction *NewSI = nullptr; |
10016 | Value *StoredVal = State.get(StoredValue, Part); |
10017 | if (CreateGatherScatter) { |
10018 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; |
10019 | Value *VectorGep = State.get(getAddr(), Part); |
10020 | NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, |
10021 | MaskPart); |
10022 | } else { |
10023 | if (Reverse) { |
10024 | // If we store to reverse consecutive memory locations, then we need |
10025 | // to reverse the order of elements in the stored value. |
10026 | StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); |
10027 | // We don't want to update the value in the map as it might be used in |
10028 | // another expression. So don't call resetVectorValue(StoredVal). |
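// On fixed-width targets, the CreateVectorReverse above lowers to a plain
// shufflevector with a descending mask, e.g. (illustrative, VF = 4):
//   %reverse = shufflevector <4 x float> %val, <4 x float> poison,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>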
10029 | } |
10030 | auto *VecPtr = |
10031 | CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); |
10032 | if (isMaskRequired) |
10033 | NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, |
10034 | BlockInMaskParts[Part]); |
10035 | else |
10036 | NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); |
10037 | } |
10038 | State.ILV->addMetadata(NewSI, SI); |
10039 | } |
10040 | return; |
10041 | } |
10042 | |
10043 | // Handle loads. |
10044 | assert(LI && "Must have a load instruction");
10045 | State.ILV->setDebugLocFromInst(LI); |
10046 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
10047 | Value *NewLI; |
10048 | if (CreateGatherScatter) { |
10049 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; |
10050 | Value *VectorGep = State.get(getAddr(), Part); |
10051 | NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, |
10052 | nullptr, "wide.masked.gather"); |
10053 | State.ILV->addMetadata(NewLI, LI); |
10054 | } else { |
10055 | auto *VecPtr = |
10056 | CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); |
10057 | if (isMaskRequired) |
10058 | NewLI = Builder.CreateMaskedLoad( |
10059 | DataTy, VecPtr, Alignment, BlockInMaskParts[Part], |
10060 | PoisonValue::get(DataTy), "wide.masked.load"); |
10061 | else |
10062 | NewLI = |
10063 | Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); |
10064 | |
10065 | // Add metadata to the load, but setVectorValue to the reverse shuffle. |
10066 | State.ILV->addMetadata(NewLI, LI); |
10067 | if (Reverse) |
10068 | NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); |
10069 | } |
10070 | |
10071 | State.set(this, NewLI, Part); |
10072 | } |
10073 | } |
10074 | |
10075 | // Determine how to lower the scalar epilogue, which depends on 1) optimising |
10076 | // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing |
10077 | // predication, and 4) a TTI hook that analyses whether the loop is suitable |
10078 | // for predication. |
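// As a concrete illustration, step 2 reacts to the command-line override
// behind the PreferPredicateOverEpilogue option, e.g. (illustrative
// invocation):
//   -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize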
10079 | static ScalarEpilogueLowering getScalarEpilogueLowering( |
10080 | Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, |
10081 | BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
10082 | AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, |
10083 | LoopVectorizationLegality &LVL) { |
10084 | // 1) OptSize takes precedence over all other options, i.e. if this is set, |
10085 | // don't look at hints or options, and don't request a scalar epilogue. |
10086 | // (For PGSO, as shouldOptimizeForSize isn't currently accessible from |
10087 | // LoopAccessInfo (due to code dependency and not being able to reliably get |
10088 | // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection |
10089 | // of strides in LoopAccessInfo::analyzeLoop() and vectorize without |
10090 | // versioning when the vectorization is forced, unlike hasOptSize. So revert |
10091 | // back to the old way and vectorize with versioning when forced. See D81345.) |
10092 | if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, |
10093 | PGSOQueryType::IRPass) && |
10094 | Hints.getForce() != LoopVectorizeHints::FK_Enabled)) |
10095 | return CM_ScalarEpilogueNotAllowedOptSize; |
10096 | |
10097 | // 2) If set, obey the directives |
10098 | if (PreferPredicateOverEpilogue.getNumOccurrences()) { |
10099 | switch (PreferPredicateOverEpilogue) { |
10100 | case PreferPredicateTy::ScalarEpilogue: |
10101 | return CM_ScalarEpilogueAllowed; |
10102 | case PreferPredicateTy::PredicateElseScalarEpilogue: |
10103 | return CM_ScalarEpilogueNotNeededUsePredicate; |
10104 | case PreferPredicateTy::PredicateOrDontVectorize: |
10105 | return CM_ScalarEpilogueNotAllowedUsePredicate; |
10106 | }; |
10107 | } |
10108 | |
10109 | // 3) If set, obey the hints |
10110 | switch (Hints.getPredicate()) { |
10111 | case LoopVectorizeHints::FK_Enabled: |
10112 | return CM_ScalarEpilogueNotNeededUsePredicate; |
10113 | case LoopVectorizeHints::FK_Disabled: |
10114 | return CM_ScalarEpilogueAllowed; |
10115 | }; |
10116 | |
10117 | // 4) If the TTI hook indicates this is profitable, request predication.
10118 | if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, |
10119 | LVL.getLAI())) |
10120 | return CM_ScalarEpilogueNotNeededUsePredicate; |
10121 | |
10122 | return CM_ScalarEpilogueAllowed; |
10123 | } |
10124 | |
10125 | Value *VPTransformState::get(VPValue *Def, unsigned Part) { |
10126 | // If values have been set for this Def, return the one relevant for \p Part.
10127 | if (hasVectorValue(Def, Part)) |
10128 | return Data.PerPartOutput[Def][Part]; |
10129 | |
10130 | if (!hasScalarValue(Def, {Part, 0})) { |
10131 | Value *IRV = Def->getLiveInIRValue(); |
10132 | Value *B = ILV->getBroadcastInstrs(IRV); |
10133 | set(Def, B, Part); |
10134 | return B; |
10135 | } |
10136 | |
10137 | Value *ScalarValue = get(Def, {Part, 0}); |
10138 | // If we aren't vectorizing, we can just copy the scalar map values over |
10139 | // to the vector map. |
10140 | if (VF.isScalar()) { |
10141 | set(Def, ScalarValue, Part); |
10142 | return ScalarValue; |
10143 | } |
10144 | |
10145 | auto *RepR = dyn_cast<VPReplicateRecipe>(Def); |
10146 | bool IsUniform = RepR && RepR->isUniform(); |
10147 | |
10148 | unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; |
10149 | // Check if there is a scalar value for the selected lane. |
10150 | if (!hasScalarValue(Def, {Part, LastLane})) { |
10151 | // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. |
10152 | assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10153 | "unexpected recipe found to be invariant");
10154 | IsUniform = true; |
10155 | LastLane = 0; |
10156 | } |
10157 | |
10158 | auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); |
10159 | // Set the insert point after the last scalarized instruction or after the |
10160 | // last PHI, if LastInst is a PHI. This ensures the insertelement sequence |
10161 | // will directly follow the scalar definitions. |
10162 | auto OldIP = Builder.saveIP(); |
10163 | auto NewIP = |
10164 | isa<PHINode>(LastInst) |
10165 | ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) |
10166 | : std::next(BasicBlock::iterator(LastInst)); |
10167 | Builder.SetInsertPoint(&*NewIP); |
10168 | |
10169 | // However, if we are vectorizing, we need to construct the vector values. |
10170 | // If the value is known to be uniform after vectorization, we can just |
10171 | // broadcast the scalar value corresponding to lane zero for each unroll |
10172 | // iteration. Otherwise, we construct the vector values using |
10173 | // insertelement instructions. Since the resulting vectors are stored in |
10174 | // State, we will only generate the insertelements once. |
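// For instance (illustrative, VF = 4, i32 element type), the packing emits:
//   %pack0 = insertelement <4 x i32> poison, i32 %scalar0, i32 0
//   %pack1 = insertelement <4 x i32> %pack0, i32 %scalar1, i32 1
//   %pack2 = insertelement <4 x i32> %pack1, i32 %scalar2, i32 2
//   %pack3 = insertelement <4 x i32> %pack2, i32 %scalar3, i32 3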
10175 | Value *VectorValue = nullptr; |
10176 | if (IsUniform) { |
10177 | VectorValue = ILV->getBroadcastInstrs(ScalarValue); |
10178 | set(Def, VectorValue, Part); |
10179 | } else { |
10180 | // Initialize packing with insertelements to start from poison.
10181 | assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10182 | Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); |
10183 | set(Def, Undef, Part); |
10184 | for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) |
10185 | ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); |
10186 | VectorValue = get(Def, Part); |
10187 | } |
10188 | Builder.restoreIP(OldIP); |
10189 | return VectorValue; |
10190 | } |
10191 | |
10192 | // Process the loop in the VPlan-native vectorization path. This path builds |
10193 | // VPlan upfront in the vectorization pipeline, which allows applying
10194 | // VPlan-to-VPlan transformations from the very beginning without modifying the |
10195 | // input LLVM IR. |
10196 | static bool processLoopInVPlanNativePath( |
10197 | Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, |
10198 | LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, |
10199 | TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, |
10200 | OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, |
10201 | ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, |
10202 | LoopVectorizationRequirements &Requirements) { |
10203 | |
10204 | if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { |
10205 | LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10206 | return false; |
10207 | } |
10208 | assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10209 | Function *F = L->getHeader()->getParent(); |
10210 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); |
10211 | |
10212 | ScalarEpilogueLowering SEL = getScalarEpilogueLowering( |
10213 | F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); |
10214 | |
10215 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, |
10216 | &Hints, IAI); |
10217 | // Use the planner for outer loop vectorization. |
10218 | // TODO: CM is not used at this point inside the planner. Turn CM into an |
10219 | // optional argument if we don't need it in the future. |
10220 | LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, |
10221 | Requirements, ORE); |
10222 | |
10223 | // Get user vectorization factor. |
10224 | ElementCount UserVF = Hints.getWidth(); |
10225 | |
10226 | CM.collectElementTypesForWidening(); |
10227 | |
10228 | // Plan how to best vectorize, return the best VF and its cost. |
10229 | const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); |
10230 | |
10231 | // If we are stress testing VPlan builds, do not attempt to generate vector |
10232 | // code. Masked vector code generation support will follow soon. |
10233 | // Also, do not attempt to vectorize if no vector code will be produced. |
10234 | if (VPlanBuildStressTest || EnableVPlanPredication || |
10235 | VectorizationFactor::Disabled() == VF) |
10236 | return false; |
10237 | |
10238 | VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); |
10239 | |
10240 | { |
10241 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, |
10242 | F->getParent()->getDataLayout()); |
10243 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, |
10244 | &CM, BFI, PSI, Checks); |
10245 | LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10246 | << L->getHeader()->getParent()->getName() << "\"\n");
10247 | LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); |
10248 | } |
10249 | |
10250 | // Mark the loop as already vectorized to avoid vectorizing again. |
10251 | Hints.setAlreadyVectorized(); |
10252 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10253 | return true; |
10254 | } |
10255 | |
10256 | // Emit a remark if there are stores to floats that required a floating point |
10257 | // extension. If the vectorized loop was generated with such conversions, there
10258 | // will be a performance penalty from the conversion overhead and the change in |
10259 | // the vector width. |
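// The pattern this searches for looks roughly like the following IR
// (illustrative), where a float is widened to double for the arithmetic and
// truncated back before the store:
//   %ext   = fpext float %a to double
//   %mul   = fmul double %ext, %b
//   %trunc = fptrunc double %mul to float
//   store float %trunc, float* %p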
10260 | static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { |
10261 | SmallVector<Instruction *, 4> Worklist; |
10262 | for (BasicBlock *BB : L->getBlocks()) { |
10263 | for (Instruction &Inst : *BB) { |
10264 | if (auto *S = dyn_cast<StoreInst>(&Inst)) { |
10265 | if (S->getValueOperand()->getType()->isFloatTy()) |
10266 | Worklist.push_back(S); |
10267 | } |
10268 | } |
10269 | } |
10270 | |
10271 | // Traverse upwards from the floating point stores, searching for floating
10272 | // point conversions.
10273 | SmallPtrSet<const Instruction *, 4> Visited; |
10274 | SmallPtrSet<const Instruction *, 4> EmittedRemark; |
10275 | while (!Worklist.empty()) { |
10276 | auto *I = Worklist.pop_back_val(); |
10277 | if (!L->contains(I)) |
10278 | continue; |
10279 | if (!Visited.insert(I).second) |
10280 | continue; |
10281 | |
10282 | // Emit a remark if the floating point store required a floating |
10283 | // point conversion. |
10284 | // TODO: More work could be done to identify the root cause such as a |
10285 | // constant or a function return type and point the user to it. |
10286 | if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) |
10287 | ORE->emit([&]() { |
10288 | return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10289 | I->getDebugLoc(), L->getHeader()) |
10290 | << "floating point conversion changes vector width. " |
10291 | << "Mixed floating point precision requires an up/down " |
10292 | << "cast that will negatively impact performance."; |
10293 | }); |
10294 | |
10295 | for (Use &Op : I->operands()) |
10296 | if (auto *OpI = dyn_cast<Instruction>(Op)) |
10297 | Worklist.push_back(OpI); |
10298 | } |
10299 | } |
10300 | |
10301 | LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) |
10302 | : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || |
10303 | !EnableLoopInterleaving), |
10304 | VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || |
10305 | !EnableLoopVectorization) {} |
10306 | |
10307 | bool LoopVectorizePass::processLoop(Loop *L) { |
10308 | assert((EnableVPlanNativePath || L->isInnermost()) &&
10309 | "VPlan-native path is not enabled. Only process inner loops.");
10310 | |
10311 | #ifndef NDEBUG |
10312 | const std::string DebugLocStr = getDebugLocString(L); |
10313 | #endif /* NDEBUG */ |
10314 | |
10315 | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10316 | << L->getHeader()->getParent()->getName() << "\" from "
10317 | << DebugLocStr << "\n");
10318 | |
10319 | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); |
10320 | |
10321 | LLVM_DEBUG(
10322 | dbgs() << "LV: Loop hints:"
10323 | << " force="
10324 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10325 | ? "disabled"
10326 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10327 | ? "enabled"
10328 | : "?"))
10329 | << " width=" << Hints.getWidth()
10330 | << " interleave=" << Hints.getInterleave() << "\n");
10331 | |
10332 | // Function containing loop |
10333 | Function *F = L->getHeader()->getParent(); |
10334 | |
10335 | // Looking at the diagnostic output is the only way to determine if a loop |
10336 | // was vectorized (other than looking at the IR or machine code), so it |
10337 | // is important to generate an optimization remark for each loop. Most of |
10338 | // these messages are generated as OptimizationRemarkAnalysis. Remarks |
10339 | // generated as OptimizationRemark and OptimizationRemarkMissed are |
10340 | // less verbose, reporting vectorized loops and unvectorized loops that may
10341 | // benefit from vectorization, respectively. |
10342 | |
10343 | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
10344 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10345 | return false; |
10346 | } |
10347 | |
10348 | PredicatedScalarEvolution PSE(*SE, *L); |
10349 | |
10350 | // Check if it is legal to vectorize the loop. |
10351 | LoopVectorizationRequirements Requirements; |
10352 | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, |
10353 | &Requirements, &Hints, DB, AC, BFI, PSI); |
10354 | if (!LVL.canVectorize(EnableVPlanNativePath)) { |
10355 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10356 | Hints.emitRemarkWithHints(); |
10357 | return false; |
10358 | } |
10359 | |
10360 | // Check the function attributes and profiles to find out if this function |
10361 | // should be optimized for size. |
10362 | ScalarEpilogueLowering SEL = getScalarEpilogueLowering( |
10363 | F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); |
10364 | |
10365 | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
10366 | // here. They may require CFG and instruction level transformations before |
10367 | // even evaluating whether vectorization is profitable. Since we cannot modify |
10368 | // the incoming IR, we need to build VPlan upfront in the vectorization |
10369 | // pipeline. |
10370 | if (!L->isInnermost()) |
10371 | return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, |
10372 | ORE, BFI, PSI, Hints, Requirements); |
10373 | |
10374 | assert(L->isInnermost() && "Inner loop expected.");
10375 | |
10376 | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
10377 | // count by optimizing for size, to minimize overheads. |
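// For example, a loop with a known trip count of 2 leaves almost no room to
// amortize the vector setup and epilogue overhead.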
10378 | auto ExpectedTC = getSmallBestKnownTC(*SE, L); |
10379 | if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { |
10380 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10381 | << "This loop is worth vectorizing only if no scalar "
10382 | << "iteration overheads are incurred.");
10383 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
10384 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10385 | else { |
10386 | LLVM_DEBUG(dbgs() << "\n");
10387 | SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; |
10388 | } |
10389 | } |
10390 | |
10391 | // Check the function attributes to see if implicit floats are allowed. |
10392 | // FIXME: This check doesn't seem possibly correct -- what if the loop is |
10393 | // an integer loop and the vector instructions selected are purely integer |
10394 | // vector instructions? |
10395 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
10396 | reportVectorizationFailure( |
10397 | "Can't vectorize when the NoImplicitFloat attribute is used", |
10398 | "loop not vectorized due to NoImplicitFloat attribute", |
10399 | "NoImplicitFloat", ORE, L); |
10400 | Hints.emitRemarkWithHints(); |
10401 | return false; |
10402 | } |
10403 | |
10404 | // Check if the target supports potentially unsafe FP vectorization. |
10405 | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
10406 | // for the target we're vectorizing for, to make sure none of the |
10407 | // additional fp-math flags can help. |
10408 | if (Hints.isPotentiallyUnsafe() && |
10409 | TTI->isFPVectorizationPotentiallyUnsafe()) { |
10410 | reportVectorizationFailure( |
10411 | "Potentially unsafe FP op prevents vectorization", |
10412 | "loop not vectorized due to unsafe FP support.", |
10413 | "UnsafeFP", ORE, L); |
10414 | Hints.emitRemarkWithHints(); |
10415 | return false; |
10416 | } |
10417 | |
10418 | bool AllowOrderedReductions; |
10419 | // If the flag is set, use that instead and override the TTI behaviour. |
10420 | if (ForceOrderedReductions.getNumOccurrences() > 0) |
10421 | AllowOrderedReductions = ForceOrderedReductions; |
10422 | else |
10423 | AllowOrderedReductions = TTI->enableOrderedReductions(); |
10424 | if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { |
10425 | ORE->emit([&]() { |
10426 | auto *ExactFPMathInst = Requirements.getExactFPInst(); |
10427 | return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10428 | ExactFPMathInst->getDebugLoc(), |
10429 | ExactFPMathInst->getParent()) |
10430 | << "loop not vectorized: cannot prove it is safe to reorder " |
10431 | "floating-point operations"; |
10432 | }); |
10433 | LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10434 | "reorder floating-point operations\n");
10435 | Hints.emitRemarkWithHints(); |
10436 | return false; |
10437 | } |
10438 | |
10439 | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
10440 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
10441 | |
10442 | // If an override option has been passed in for interleaved accesses, use it. |
10443 | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
10444 | UseInterleaved = EnableInterleavedMemAccesses; |
10445 | |
10446 | // Analyze interleaved memory accesses. |
10447 | if (UseInterleaved) { |
10448 | IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); |
10449 | } |
10450 | |
10451 | // Use the cost model. |
10452 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, |
10453 | F, &Hints, IAI); |
10454 | CM.collectValuesToIgnore(); |
10455 | CM.collectElementTypesForWidening(); |
10456 | |
10457 | // Use the planner for vectorization. |
10458 | LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, |
10459 | Requirements, ORE); |
10460 | |
10461 | // Get user vectorization factor and interleave count. |
10462 | ElementCount UserVF = Hints.getWidth(); |
10463 | unsigned UserIC = Hints.getInterleave(); |
10464 | |
10465 | // Plan how to best vectorize, return the best VF and its cost. |
10466 | Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); |
10467 | |
10468 | VectorizationFactor VF = VectorizationFactor::Disabled(); |
10469 | unsigned IC = 1; |
10470 | |
10471 | if (MaybeVF) { |
10472 | VF = *MaybeVF; |
10473 | // Select the interleave count. |
10474 | IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); |
10475 | } |
10476 | |
10477 | // Identify the diagnostic messages that should be produced. |
10478 | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
10479 | bool VectorizeLoop = true, InterleaveLoop = true; |
10480 | if (VF.Width.isScalar()) { |
10481 | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10482 | VecDiagMsg = std::make_pair( |
10483 | "VectorizationNotBeneficial", |
10484 | "the cost-model indicates that vectorization is not beneficial"); |
10485 | VectorizeLoop = false; |
10486 | } |
10487 | |
10488 | if (!MaybeVF && UserIC > 1) { |
10489 | // Tell the user interleaving was avoided up-front, despite being explicitly |
10490 | // requested. |
10491 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10492 | "interleaving should be avoided up front\n");
10493 | IntDiagMsg = std::make_pair( |
10494 | "InterleavingAvoided", |
10495 | "Ignoring UserIC, because interleaving was avoided up front"); |
10496 | InterleaveLoop = false; |
10497 | } else if (IC == 1 && UserIC <= 1) { |
10498 | // Tell the user interleaving is not beneficial. |
10499 | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10500 | IntDiagMsg = std::make_pair( |
10501 | "InterleavingNotBeneficial", |
10502 | "the cost-model indicates that interleaving is not beneficial"); |
10503 | InterleaveLoop = false; |
10504 | if (UserIC == 1) { |
10505 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; |
10506 | IntDiagMsg.second += |
10507 | " and is explicitly disabled or interleave count is set to 1"; |
10508 | } |
10509 | } else if (IC > 1 && UserIC == 1) { |
10510 | // Tell the user interleaving is beneficial, but it is explicitly disabled.
10511 | LLVM_DEBUG(
10512 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10513 | IntDiagMsg = std::make_pair( |
10514 | "InterleavingBeneficialButDisabled", |
10515 | "the cost-model indicates that interleaving is beneficial " |
10516 | "but is explicitly disabled or interleave count is set to 1"); |
10517 | InterleaveLoop = false; |
10518 | } |
10519 | |
10520 | // Override IC if user provided an interleave count. |
10521 | IC = UserIC > 0 ? UserIC : IC; |
10522 | |
10523 | // Emit diagnostic messages, if any. |
10524 | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
10525 | if (!VectorizeLoop && !InterleaveLoop) { |
10526 | // Do not vectorize or interleave the loop.
10527 | ORE->emit([&]() { |
10528 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
10529 | L->getStartLoc(), L->getHeader()) |
10530 | << VecDiagMsg.second; |
10531 | }); |
10532 | ORE->emit([&]() { |
10533 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10534 | L->getStartLoc(), L->getHeader()) |
10535 | << IntDiagMsg.second; |
10536 | }); |
10537 | return false; |
10538 | } else if (!VectorizeLoop && InterleaveLoop) { |
10539 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10540 | ORE->emit([&]() { |
10541 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
10542 | L->getStartLoc(), L->getHeader()) |
10543 | << VecDiagMsg.second; |
10544 | }); |
10545 | } else if (VectorizeLoop && !InterleaveLoop) { |
10546 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10547 | << ") in " << DebugLocStr << '\n');
10548 | ORE->emit([&]() { |
10549 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10550 | L->getStartLoc(), L->getHeader()) |
10551 | << IntDiagMsg.second; |
10552 | }); |
10553 | } else if (VectorizeLoop && InterleaveLoop) { |
10554 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10555 | << ") in " << DebugLocStr << '\n');
10556 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10557 | } |
10558 | |
10559 | bool DisableRuntimeUnroll = false; |
10560 | MDNode *OrigLoopID = L->getLoopID(); |
10561 | { |
10562 | // Optimistically generate runtime checks. Drop them if they turn out not to
10563 | // be profitable. Limit the scope of Checks, so the cleanup happens
10564 | // immediately after vector code generation is done.
10565 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, |
10566 | F->getParent()->getDataLayout()); |
10567 | if (!VF.Width.isScalar() || IC > 1) |
10568 | Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); |
10569 | |
10570 | using namespace ore; |
10571 | if (!VectorizeLoop) { |
10572 | assert(IC > 1 && "interleave count should not be 1 or 0");
10573 | // If we decided that vectorizing the loop is not worthwhile, then
10574 | // interleave it.
10575 | InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, |
10576 | &CM, BFI, PSI, Checks); |
10577 | |
10578 | VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); |
10579 | LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); |
10580 | |
10581 | ORE->emit([&]() { |
10582 | return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10583 | L->getHeader()) |
10584 | << "interleaved loop (interleaved count: " |
10585 | << NV("InterleaveCount", IC) << ")"; |
10586 | }); |
10587 | } else { |
10588 | // If we decided that it is worthwhile to vectorize the loop, then do it.
10589 | |
10590 | // Consider vectorizing the epilogue too if it's profitable. |
10591 | VectorizationFactor EpilogueVF = |
10592 | CM.selectEpilogueVectorizationFactor(VF.Width, LVP); |
10593 | if (EpilogueVF.Width.isVector()) { |
10594 | |
10595 | // The first pass vectorizes the main loop and creates a scalar epilogue |
10596 | // to be vectorized by executing the plan (potentially with a different |
10597 | // factor) again shortly afterwards. |
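// Schematically (illustrative), the resulting control flow is:
//   main vector loop (VF.Width, unrolled by IC)
//     -> epilogue vector loop (EpilogueVF.Width, no unrolling)
//       -> scalar remainder loop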
10598 | EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); |
10599 | EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, |
10600 | EPI, &LVL, &CM, BFI, PSI, Checks); |
10601 | |
10602 | VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); |
10603 | LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, |
10604 | DT); |
10605 | ++LoopsVectorized; |
10606 | |
10607 | simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); |
10608 | formLCSSARecursively(*L, *DT, LI, SE); |
10609 | |
10610 | // Second pass vectorizes the epilogue and adjusts the control flow |
10611 | // edges from the first pass. |
10612 | EPI.MainLoopVF = EPI.EpilogueVF; |
10613 | EPI.MainLoopUF = EPI.EpilogueUF; |
10614 | EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, |
10615 | ORE, EPI, &LVL, &CM, BFI, PSI, |
10616 | Checks); |
10617 | |
10618 | VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); |
10619 | |
10620 | // Ensure that the start values for any VPReductionPHIRecipes are |
10621 | // updated before vectorising the epilogue loop. |
10622 | VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); |
10623 | for (VPRecipeBase &R : Header->phis()) { |
10624 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { |
10625 | if (auto *Resume = MainILV.getReductionResumeValue( |
10626 | ReductionPhi->getRecurrenceDescriptor())) { |
10627 | VPValue *StartVal = new VPValue(Resume); |
10628 | BestEpiPlan.addExternalDef(StartVal); |
10629 | ReductionPhi->setOperand(0, StartVal); |
10630 | } |
10631 | } |
10632 | } |
10633 | |
10634 | LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, |
10635 | DT); |
10636 | ++LoopsEpilogueVectorized; |
10637 | |
10638 | if (!MainILV.areSafetyChecksAdded()) |
10639 | DisableRuntimeUnroll = true; |
10640 | } else { |
10641 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, |
10642 | &LVL, &CM, BFI, PSI, Checks); |
10643 | |
10644 | VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); |
10645 | LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); |
10646 | ++LoopsVectorized; |
10647 | |
10648 | // Add metadata to disable runtime unrolling a scalar loop when there |
10649 | // are no runtime checks about strides and memory. A scalar loop that is |
10650 | // rarely used is not worth unrolling. |
10651 | if (!LB.areSafetyChecksAdded()) |
10652 | DisableRuntimeUnroll = true; |
10653 | } |
10654 | // Report the vectorization decision. |
10655 | ORE->emit([&]() { |
10656 | return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10657 | L->getHeader()) |
10658 | << "vectorized loop (vectorization width: " |
10659 | << NV("VectorizationFactor", VF.Width) |
10660 | << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; |
10661 | }); |
10662 | } |
10663 | |
10664 | if (ORE->allowExtraAnalysis(LV_NAME))
10665 | checkMixedPrecision(L, ORE); |
10666 | } |
10667 | |
10668 | Optional<MDNode *> RemainderLoopID = |
10669 | makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, |
10670 | LLVMLoopVectorizeFollowupEpilogue}); |
10671 | if (RemainderLoopID.hasValue()) { |
10672 | L->setLoopID(RemainderLoopID.getValue()); |
10673 | } else { |
10674 | if (DisableRuntimeUnroll) |
10675 | AddRuntimeUnrollDisableMetaData(L); |
10676 | |
10677 | // Mark the loop as already vectorized to avoid vectorizing again. |
10678 | Hints.setAlreadyVectorized(); |
10679 | } |
10680 | |
10681 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10682 | return true; |
10683 | } |
10684 | |
10685 | LoopVectorizeResult LoopVectorizePass::runImpl( |
10686 | Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, |
10687 | DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, |
10688 | DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, |
10689 | std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, |
10690 | OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { |
10691 | SE = &SE_; |
10692 | LI = &LI_; |
10693 | TTI = &TTI_; |
10694 | DT = &DT_; |
10695 | BFI = &BFI_; |
10696 | TLI = TLI_; |
10697 | AA = &AA_; |
10698 | AC = &AC_; |
10699 | GetLAA = &GetLAA_; |
10700 | DB = &DB_; |
10701 | ORE = &ORE_; |
10702 | PSI = PSI_; |
10703 | |
10704 | // Don't attempt if |
10705 | // 1. the target claims to have no vector registers, and |
10706 | // 2. interleaving won't help ILP. |
10707 | // |
10708 | // The second condition is necessary because, even if the target has no |
10709 | // vector registers, loop vectorization may still enable scalar |
10710 | // interleaving. |
10711 | if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && |
10712 | TTI->getMaxInterleaveFactor(1) < 2) |
10713 | return LoopVectorizeResult(false, false); |
10714 | |
10715 | bool Changed = false, CFGChanged = false; |
10716 | |
10717 | // The vectorizer requires loops to be in simplified form. |
10718 | // Since simplification may add new inner loops, it has to run before the |
10719 | // legality and profitability checks. This means running the loop vectorizer |
10720 | // will simplify all loops, regardless of whether anything ends up being
10721 | // vectorized. |
10722 | for (auto &L : *LI) |
10723 | Changed |= CFGChanged |= |
10724 | simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); |
10725 | |
10726 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
10727 | // the act of vectorizing or partially unrolling a loop creates new loops |
10728 | // and can invalidate iterators across the loops. |
10729 | SmallVector<Loop *, 8> Worklist; |
10730 | |
10731 | for (Loop *L : *LI) |
10732 | collectSupportedLoops(*L, LI, ORE, Worklist); |
10733 | |
10734 | LoopsAnalyzed += Worklist.size(); |
10735 | |
10736 | // Now walk the identified inner loops. |
10737 | while (!Worklist.empty()) { |
10738 | Loop *L = Worklist.pop_back_val(); |
10739 | |
10740 | // For the inner loops we actually process, form LCSSA to simplify the |
10741 | // transform. |
10742 | Changed |= formLCSSARecursively(*L, *DT, LI, SE); |
10743 | |
10744 | Changed |= CFGChanged |= processLoop(L); |
10745 | } |
10746 | |
10747 | // Process each loop nest in the function. |
10748 | return LoopVectorizeResult(Changed, CFGChanged); |
10749 | } |
10750 | |
10751 | PreservedAnalyses LoopVectorizePass::run(Function &F, |
10752 | FunctionAnalysisManager &AM) { |
10753 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
10754 | auto &LI = AM.getResult<LoopAnalysis>(F); |
10755 | auto &TTI = AM.getResult<TargetIRAnalysis>(F); |
10756 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
10757 | auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); |
10758 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); |
10759 | auto &AA = AM.getResult<AAManager>(F); |
10760 | auto &AC = AM.getResult<AssumptionAnalysis>(F); |
10761 | auto &DB = AM.getResult<DemandedBitsAnalysis>(F); |
10762 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
10763 | |
10764 | auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); |
10765 | std::function<const LoopAccessInfo &(Loop &)> GetLAA = |
10766 | [&](Loop &L) -> const LoopAccessInfo & { |
10767 | LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, |
10768 | TLI, TTI, nullptr, nullptr, nullptr}; |
10769 | return LAM.getResult<LoopAccessAnalysis>(L, AR); |
10770 | }; |
10771 | auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); |
10772 | ProfileSummaryInfo *PSI = |
10773 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); |
10774 | LoopVectorizeResult Result = |
10775 | runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); |
10776 | if (!Result.MadeAnyChange) |
10777 | return PreservedAnalyses::all(); |
10778 | PreservedAnalyses PA; |
10779 | |
10780 | // We currently do not preserve loopinfo/dominator analyses with outer loop |
10781 | // vectorization. Until this is addressed, mark these analyses as preserved |
10782 | // only for non-VPlan-native path. |
10783 | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
10784 | if (!EnableVPlanNativePath) { |
10785 | PA.preserve<LoopAnalysis>(); |
10786 | PA.preserve<DominatorTreeAnalysis>(); |
10787 | } |
10788 | |
10789 | if (Result.MadeCFGChange) { |
10790 | // Making CFG changes likely means a loop got vectorized. Indicate that |
10791 | // extra simplification passes should be run. |
10792 | // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10793 | // be run if runtime checks have been added. |
10794 | AM.getResult<ShouldRunExtraVectorPasses>(F); |
10795 | PA.preserve<ShouldRunExtraVectorPasses>(); |
10796 | } else { |
10797 | PA.preserveSet<CFGAnalyses>(); |
10798 | } |
10799 | return PA; |
10800 | } |
10801 | |
10802 | void LoopVectorizePass::printPipeline( |
10803 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
10804 | static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( |
10805 | OS, MapClassName2PassName); |
10806 | |
10807 | OS << "<"; |
10808 | OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; |
10809 | OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; |
10810 | OS << ">"; |
10811 | } |