/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp

Bug Summary

File:	lib/Transforms/Vectorize/LoopVectorize.cpp
Location:	line 1199, column 5
Description:	Value stored to 'LoopID' is never read

Annotated Source Code

1	//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2	//
3	// The LLVM Compiler Infrastructure
4	//
5	// This file is distributed under the University of Illinois Open Source
6	// License. See LICENSE.TXT for details.
7	//
8	//===----------------------------------------------------------------------===//
9	//
10	// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11	// and generates target-independent LLVM-IR.
12	// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13	// of instructions in order to estimate the profitability of vectorization.
14	//
15	// The loop vectorizer combines consecutive loop iterations into a single
16	// 'wide' iteration. After this transformation the index is incremented
17	// by the SIMD vector width, and not by one.
18	//
19	// This pass has four parts:
20	// 1. The main loop pass that drives the different parts.
21	// 2. LoopVectorizationLegality - A unit that checks for the legality
22	// of the vectorization.
23	// 3. InnerLoopVectorizer - A unit that performs the actual
24	// widening of instructions.
25	// 4. LoopVectorizationCostModel - A unit that checks for the profitability
26	// of vectorization. It decides on the optimal vector width, which
27	// can be one, if vectorization is not profitable.
28	//
29	//===----------------------------------------------------------------------===//
30	//
31	// The reduction-variable vectorization is based on the paper:
32	// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33	//
34	// Variable uniformity checks are inspired by:
35	// Karrenberg, R. and Hack, S. Whole Function Vectorization.
36	//
37	// Other ideas/concepts are from:
38	// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
39	//
40	// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
41	// Vectorizing Compilers.
42	//
43	//===----------------------------------------------------------------------===//
44
45	#include "llvm/Transforms/Vectorize.h"
46	#include "llvm/ADT/DenseMap.h"
47	#include "llvm/ADT/EquivalenceClasses.h"
48	#include "llvm/ADT/Hashing.h"
49	#include "llvm/ADT/MapVector.h"
50	#include "llvm/ADT/SetVector.h"
51	#include "llvm/ADT/SmallPtrSet.h"
52	#include "llvm/ADT/SmallSet.h"
53	#include "llvm/ADT/SmallVector.h"
54	#include "llvm/ADT/Statistic.h"
55	#include "llvm/ADT/StringExtras.h"
56	#include "llvm/Analysis/AliasAnalysis.h"
57	#include "llvm/Analysis/AliasSetTracker.h"
58	#include "llvm/Analysis/AssumptionTracker.h"
59	#include "llvm/Analysis/BlockFrequencyInfo.h"
60	#include "llvm/Analysis/CodeMetrics.h"
61	#include "llvm/Analysis/LoopInfo.h"
62	#include "llvm/Analysis/LoopIterator.h"
63	#include "llvm/Analysis/LoopPass.h"
64	#include "llvm/Analysis/ScalarEvolution.h"
65	#include "llvm/Analysis/ScalarEvolutionExpander.h"
66	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
67	#include "llvm/Analysis/TargetTransformInfo.h"
68	#include "llvm/Analysis/ValueTracking.h"
69	#include "llvm/IR/Constants.h"
70	#include "llvm/IR/DataLayout.h"
71	#include "llvm/IR/DebugInfo.h"
72	#include "llvm/IR/DerivedTypes.h"
73	#include "llvm/IR/DiagnosticInfo.h"
74	#include "llvm/IR/Dominators.h"
75	#include "llvm/IR/Function.h"
76	#include "llvm/IR/IRBuilder.h"
77	#include "llvm/IR/Instructions.h"
78	#include "llvm/IR/IntrinsicInst.h"
79	#include "llvm/IR/LLVMContext.h"
80	#include "llvm/IR/Module.h"
81	#include "llvm/IR/PatternMatch.h"
82	#include "llvm/IR/Type.h"
83	#include "llvm/IR/Value.h"
84	#include "llvm/IR/ValueHandle.h"
85	#include "llvm/IR/Verifier.h"
86	#include "llvm/Pass.h"
87	#include "llvm/Support/BranchProbability.h"
88	#include "llvm/Support/CommandLine.h"
89	#include "llvm/Support/Debug.h"
90	#include "llvm/Support/raw_ostream.h"
91	#include "llvm/Transforms/Scalar.h"
92	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
93	#include "llvm/Transforms/Utils/Local.h"
94	#include "llvm/Transforms/Utils/VectorUtils.h"
95	#include <algorithm>
96	#include <map>
97	#include <tuple>
98
99	using namespace llvm;
100	using namespace llvm::PatternMatch;
101	// Pass name; DEBUG_TYPE reuses it, so the statistics below are registered under "loop-vectorize".
102	#define LV_NAME "loop-vectorize"
103	#define DEBUG_TYPE LV_NAME
104	// Counters: loops actually vectorized vs. loops analyzed for vectorization.
105	STATISTIC(LoopsVectorized, "Number of loops vectorized");
106	STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
107	/// Hidden flag "force-vector-width": sets the SIMD width; 0 (the default) means autoselect.
108	static cl::opt<unsigned>
109	VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
110	cl::desc("Sets the SIMD width. Zero is autoselect."));
111	/// Hidden flag "force-vector-interleave": sets the interleave count; 0 (the default) means autoselect.
112	static cl::opt<unsigned>
113	VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
114	cl::desc("Sets the vectorization interleave count. "
115	"Zero is autoselect."));
116	/// Hidden flag "enable-if-conversion" (default true): enables if-conversion during vectorization.
117	static cl::opt<bool>
118	EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
119	cl::desc("Enable if-conversion during vectorization."));
120
121	/// We don't vectorize loops with a known constant trip count below this number (default 16).
122	static cl::opt<unsigned>
123	TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
124	cl::Hidden,
125	cl::desc("Don't vectorize loops with a constant "
126	"trip count that is smaller than this "
127	"value."));
128
129	/// This enables versioning on the strides of symbolically striding memory
130	/// accesses in code like the following.
131	/// for (i = 0; i < N; ++i)
132	/// A[i * Stride1] += B[i * Stride2] ...
133	///
134	/// Will be roughly translated to
135	/// if (Stride1 == 1 && Stride2 == 1) {
136	/// for (i = 0; i < N; i+=4)
137	/// A[i:i+3] += ...
138	/// } else
139	/// ...
140	static cl::opt<bool> EnableMemAccessVersioning(
141	"enable-mem-access-versioning", cl::init(true), cl::Hidden,
142	cl::desc("Enable symbolic stride memory access versioning"));
143
144	/// We don't unroll loops with a known constant trip count below this number.
145	static const unsigned TinyTripCountUnrollThreshold = 128;
146
147	/// When performing memory disambiguation checks at runtime do not make more
148	/// than this number of comparisons.
149	static const unsigned RuntimeMemoryCheckThreshold = 8;
150
151	/// Maximum SIMD width.
152	static const unsigned MaxVectorWidth = 64;
153	/// Hidden override for the target's scalar register count (default 0; presumably 0 means "no override" - confirm at the use site).
154	static cl::opt<unsigned> ForceTargetNumScalarRegs(
155	"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
156	cl::desc("A flag that overrides the target's number of scalar registers."));
157	/// Hidden override for the target's vector register count (default 0; presumably 0 means "no override" - confirm at the use site).
158	static cl::opt<unsigned> ForceTargetNumVectorRegs(
159	"force-target-num-vector-regs", cl::init(0), cl::Hidden,
160	cl::desc("A flag that overrides the target's number of vector registers."));
161
162	/// Maximum vectorization interleave count.
163	static const unsigned MaxInterleaveFactor = 16;
164	/// Hidden override for the target's max interleave factor for scalar loops (default 0).
165	static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
166	"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
167	cl::desc("A flag that overrides the target's max interleave factor for "
168	"scalar loops."));
169	/// Hidden override for the target's max interleave factor for vectorized loops (default 0).
170	static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
171	"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
172	cl::desc("A flag that overrides the target's max interleave factor for "
173	"vectorized loops."));
174	/// Hidden override that replaces the target's per-instruction cost with one constant; intended for testing (default 0).
175	static cl::opt<unsigned> ForceTargetInstructionCost(
176	"force-target-instruction-cost", cl::init(0), cl::Hidden,
177	cl::desc("A flag that overrides the target's expected cost for "
178	"an instruction to a single constant value. Mostly "
179	"useful for getting consistent testing."));
180	/// Cost of a loop that the unroller considers 'small' (default 20).
181	static cl::opt<unsigned> SmallLoopCost(
182	"small-loop-cost", cl::init(20), cl::Hidden,
183	cl::desc("The cost of a loop that is considered 'small' by the unroller."));
184	/// Opt-in (default false) use of block frequency analysis for PGO-driven heuristics.
185	static cl::opt<bool> LoopVectorizeWithBlockFrequency(
186	"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
187	cl::desc("Enable the use of the block frequency analysis to access PGO "
188	"heuristics minimizing code growth in cold regions and being more "
189	"aggressive in hot regions."));
190
191	// Runtime unroll loops for load/store throughput (enabled by default).
192	static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
193	"enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
194	cl::desc("Enable runtime unrolling until load/store ports are saturated"));
195
196	/// The number of stores in a loop that are allowed to need predication (default 1).
197	static cl::opt<unsigned> NumberOfStoresToPredicate(
198	"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
199	cl::desc("Max number of stores to be predicated behind an if."));
200	/// When set (the default), the induction variable is counted only once when unrolling.
201	static cl::opt<bool> EnableIndVarRegisterHeur(
202	"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
203	cl::desc("Count the induction variable only once when unrolling"));
204	/// Opt-in (default false) if-predication of stores during vectorization.
205	static cl::opt<bool> EnableCondStoresVectorization(
206	"enable-cond-stores-vec", cl::init(false), cl::Hidden,
207	cl::desc("Enable if predication of stores during vectorization."));
208	/// Cap on the unroll factor for a scalar reduction inside a nested loop (default 2).
209	static cl::opt<unsigned> MaxNestedScalarReductionUF(
210	"max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
211	cl::desc("The maximum unroll factor to use when unrolling a scalar "
212	"reduction in a nested loop."));
213
214	namespace {
215
216	// Forward declarations, so classes declared earlier in this file can refer to these types by pointer (e.g. InnerLoopVectorizer stores a LoopVectorizationLegality *).
217	class LoopVectorizationLegality;
218	class LoopVectorizationCostModel;
219	class LoopVectorizeHints;
220
221	/// Accumulates an optimization-analysis message explaining why a loop was not
222	/// vectorized. Every message starts with the prefix "loop not vectorized: ".
223	class Report {
224	std::string Message; // Backing buffer; declared before Out, which is bound to it.
225	raw_string_ostream Out;
226	Instruction *Instr; // Optional instruction the message refers to (may be null).
227
228	public:
229	Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
230	Out << "loop not vectorized: ";
231	}
232	/// Appends any streamable item to the message and allows chaining.
233	template <typename T> Report &operator<<(const T &Item) {
234	Out << Item;
235	return *this;
236	}
237	/// \return the instruction given at construction, or null if there was none.
238	Instruction *getInstr() { return Instr; }
239	/// \return the message text via the stream's str().
240	std::string &str() { return Out.str(); }
241	operator Twine() { return Twine(Out.str()); }
242	};
243
244	/// InnerLoopVectorizer vectorizes loops which contain only one basic
245	/// block to a specified vectorization factor (VF).
246	/// This class performs the widening of scalars into vectors, or multiple
247	/// scalars. This class also implements the following features:
248	/// * It inserts an epilogue loop for handling loops that don't have iteration
249	/// counts that are known to be a multiple of the vectorization factor.
250	/// * It handles the code generation for reduction variables.
251	/// * Scalarization (implementation using scalars) of un-vectorizable
252	/// instructions.
253	/// InnerLoopVectorizer does not perform any vectorization-legality
254	/// checks, and relies on the caller to check for the different legality
255	/// aspects. The InnerLoopVectorizer relies on the
256	/// LoopVectorizationLegality class to provide information about the induction
257	/// and reduction variables that were found for a given vectorization factor.
258	class InnerLoopVectorizer {
259	public:
260	InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
261	DominatorTree *DT, const DataLayout *DL,
262	const TargetLibraryInfo *TLI, unsigned VecWidth,
263	unsigned UnrollFactor)
264	: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AA(nullptr), DL(DL), TLI(TLI),
265	VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
266	LoopVectorPreHeader(nullptr), LoopScalarPreHeader(nullptr), LoopMiddleBlock(nullptr), LoopExitBlock(nullptr), LoopScalarBody(nullptr), // Not supplied to the constructor; start out null instead of indeterminate.
267	Induction(nullptr), OldInduction(nullptr), ExtendedIdx(nullptr), WidenMap(UnrollFactor), Legal(nullptr) {}
268
269	// Perform the actual loop widening (vectorization).
270	void vectorize(LoopVectorizationLegality *L) {
271	Legal = L;
272	// Create a new empty loop. Unlink the old loop and connect the new one.
273	createEmptyLoop();
274	// Widen each instruction in the old loop to a new one in the new loop.
275	// Use the Legality module to find the induction and reduction variables.
276	vectorizeLoop();
277	// Register the new loop and update the analysis passes.
278	updateAnalysis();
279	}
280
281	virtual ~InnerLoopVectorizer() {}
282
283	protected:
284	/// A small list of PHINodes.
285	typedef SmallVector<PHINode*, 4> PhiVector;
286	/// When we unroll loops we have multiple vector values for each scalar.
287	/// This data structure holds the unrolled and vectorized values that
288	/// originated from one scalar instruction.
289	typedef SmallVector<Value*, 2> VectorParts;
290
291	// When we if-convert we need to create edge masks. We have to cache values so
292	// that we don't end up with exponential recursion/IR.
293	typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
294	VectorParts> EdgeMaskCache;
295
296	/// \brief Add code that checks at runtime if the accessed arrays overlap.
297	///
298	/// Returns a pair of instructions where the first element is the first
299	/// instruction generated in possibly a sequence of instructions and the
300	/// second value is the final comparator value or NULL if no check is needed.
301	std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);
302
303	/// \brief Add checks for strides that were assumed to be 1.
304	///
305	/// Returns the last check instruction and the first check instruction in the
306	/// pair as (first, last).
307	std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
308
309	/// Create an empty loop, based on the loop ranges of the old loop.
310	void createEmptyLoop();
311	/// Copy and widen the instructions from the old loop.
312	virtual void vectorizeLoop();
313
314	/// \brief The Loop exit block may have single value PHI nodes where the
315	/// incoming value is 'Undef'. While vectorizing we only handled real values
316	/// that were defined inside the loop. Here we fix the 'undef case'.
317	/// See PR14725.
318	void fixLCSSAPHIs();
319
320	/// A helper function that computes the predicate of the block BB, assuming
321	/// that the header block of the loop is set to True. It returns the entry
322	/// mask for the block BB.
323	VectorParts createBlockInMask(BasicBlock *BB);
324	/// A helper function that computes the predicate of the edge between SRC
325	/// and DST.
326	VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
327
328	/// A helper function to vectorize a single BB within the innermost loop.
329	void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
330
331	/// Vectorize a single PHINode in a block. This method handles the induction
332	/// variable canonicalization. It supports both VF = 1 for unrolled loops and
333	/// arbitrary length vectors.
334	void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
335	unsigned UF, unsigned VF, PhiVector *PV);
336
337	/// Insert the new loop to the loop hierarchy and pass manager
338	/// and update the analysis passes.
339	void updateAnalysis();
340
341	/// This instruction is un-vectorizable. Implement it as a sequence
342	/// of scalars. If \p IfPredicateStore is true we need to 'hide' each
343	/// scalarized instruction behind an if block predicated on the control
344	/// dependence of the instruction.
345	virtual void scalarizeInstruction(Instruction *Instr,
346	bool IfPredicateStore=false);
347
348	/// Vectorize Load and Store instructions.
349	virtual void vectorizeMemoryInstruction(Instruction *Instr);
350
351	/// Create a broadcast instruction. This method generates a broadcast
352	/// instruction (shuffle) for loop invariant values and for the induction
353	/// value. If this is the induction variable then we extend it to N, N+1, ...
354	/// this is needed because each iteration in the loop corresponds to a SIMD
355	/// element.
356	virtual Value *getBroadcastInstrs(Value *V);
357
358	/// This function adds 0, 1, 2 ... to each vector element, starting at zero.
359	/// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
360	/// The sequence starts at StartIndex.
361	virtual Value *getConsecutiveVector(Value *Val, int StartIdx, bool Negate);
362
363	/// When we go over instructions in the basic block we rely on previous
364	/// values within the current basic block or on loop invariant values.
365	/// When we widen (vectorize) values we place them in the map. If the values
366	/// are not within the map, they have to be loop invariant, so we simply
367	/// broadcast them into a vector.
368	VectorParts &getVectorValue(Value *V);
369
370	/// Generate a shuffle sequence that will reverse the vector Vec.
371	virtual Value *reverseVector(Value *Vec);
372
373	/// This is a helper class that holds the vectorizer state. It maps scalar
374	/// instructions to vector instructions. When the code is 'unrolled' then
375	/// a single scalar value is mapped to multiple vector parts. The parts
376	/// are stored in the VectorParts type.
377	struct ValueMap {
378	/// C'tor. UnrollFactor controls the number of vectors ('parts') that
379	/// are mapped.
380	ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
381
382	/// \return True if 'Key' is saved in the Value Map.
383	bool has(Value *Key) const { return MapStorage.count(Key); }
384
385	/// Initializes a new entry in the map. Sets all of the vector parts to the
386	/// same value in 'Val'.
387	/// \return A reference to a vector with splat values.
388	VectorParts &splat(Value *Key, Value *Val) {
389	VectorParts &Entry = MapStorage[Key];
390	Entry.assign(UF, Val);
391	return Entry;
392	}
393
394	/// \return A reference to the value that is stored at 'Key'.
395	VectorParts &get(Value *Key) {
396	VectorParts &Entry = MapStorage[Key];
397	if (Entry.empty())
398	Entry.resize(UF);
399	assert(Entry.size() == UF);
400	return Entry;
401	}
402
403	private:
404	/// The unroll factor. Each entry in the map stores this number of vector
405	/// elements.
406	unsigned UF;
407
408	/// Map storage. We use std::map and not DenseMap because insertions to a
409	/// dense map invalidates its iterators.
410	std::map<Value *, VectorParts> MapStorage;
411	};
412
413	/// The original loop.
414	Loop *OrigLoop;
415	/// Scev analysis to use.
416	ScalarEvolution *SE;
417	/// Loop Info.
418	LoopInfo *LI;
419	/// Dominator Tree.
420	DominatorTree *DT;
421	/// Alias Analysis. Not set by the constructor; starts out null.
422	AliasAnalysis *AA;
423	/// Data Layout.
424	const DataLayout *DL;
425	/// Target Library Info.
426	const TargetLibraryInfo *TLI;
427
428	/// The vectorization SIMD factor to use. Each vector will have this many
429	/// vector elements.
430	unsigned VF;
431
432	protected:
433	/// The vectorization unroll factor to use. Each scalar is vectorized to this
434	/// many different vector instructions.
435	unsigned UF;
436
437	/// The builder that we use
438	IRBuilder<> Builder;
439
440	// --- Vectorization state ---
441
442	/// The vector-loop preheader.
443	BasicBlock *LoopVectorPreHeader;
444	/// The scalar-loop preheader.
445	BasicBlock *LoopScalarPreHeader;
446	/// Middle Block between the vector and the scalar.
447	BasicBlock *LoopMiddleBlock;
448	/// The ExitBlock of the scalar loop.
449	BasicBlock *LoopExitBlock;
450	/// The vector loop body.
451	SmallVector<BasicBlock *, 4> LoopVectorBody;
452	/// The scalar loop body.
453	BasicBlock *LoopScalarBody;
454	/// A list of all bypass blocks. The first block is the entry of the loop.
455	SmallVector<BasicBlock *, 4> LoopBypassBlocks;
456
457	/// The new Induction variable which was added to the new block.
458	PHINode *Induction;
459	/// The induction variable of the old basic block.
460	PHINode *OldInduction;
461	/// Holds the extended (to the widest induction type) start index.
462	Value *ExtendedIdx;
463	/// Maps scalars to widened vectors.
464	ValueMap WidenMap;
465	EdgeMaskCache MaskCache;
466
467	LoopVectorizationLegality *Legal;
468	};
469	/// InnerLoopVectorizer variant that passes a vector width of 1 to the base class and overrides its widening hooks.
470	class InnerLoopUnroller : public InnerLoopVectorizer {
471	public:
472	InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
473	DominatorTree *DT, const DataLayout *DL,
474	const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
475	InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
476
477	private:
478	void scalarizeInstruction(Instruction *Instr,
479	bool IfPredicateStore = false) override;
480	void vectorizeMemoryInstruction(Instruction *Instr) override;
481	Value *getBroadcastInstrs(Value *V) override;
482	Value *getConsecutiveVector(Value *Val, int StartIdx, bool Negate) override;
483	Value *reverseVector(Value *Vec) override;
484	};
485
486	/// \brief Look for a meaningful debug location on the instruction or its
487	/// operands.
488	static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
489	if (!I)
490	return I;
491	// Prefer the instruction itself when it already carries a non-empty location.
492	DebugLoc Empty;
493	if (I->getDebugLoc() != Empty)
494	return I;
495	// Otherwise return the first operand that is an instruction with a non-empty location.
496	for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
497	if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
498	if (OpInst->getDebugLoc() != Empty)
499	return OpInst;
500	}
501	// Nothing better was found; fall back to the original instruction.
502	return I;
503	}
504
505	/// \brief Make the builder's current debug location follow \p Ptr: its debug
506	/// location when it is an instruction, an empty DebugLoc otherwise (or if null).
507	static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
508	const Instruction *AsInst = dyn_cast_or_null<Instruction>(Ptr);
509	// A null or non-instruction value resets the builder to an empty location.
510	const DebugLoc Loc = AsInst ? AsInst->getDebugLoc() : DebugLoc();
511	B.SetCurrentDebugLocation(Loc);
512	}
513
514	#ifndef NDEBUG
515	/// \return the loop's start location as text, else the module identifier; empty if \p L is null.
516	static std::string getDebugLocString(const Loop *L) {
517	std::string Text;
518	if (!L)
519	return Text;
520	raw_string_ostream Stream(Text);
521	const DebugLoc StartLoc = L->getStartLoc();
522	if (StartLoc.isUnknown())
523	// Just print the module name.
524	Stream << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
525	else
526	StartLoc.print(L->getHeader()->getContext(), Stream);
527	Stream.flush();
528	return Text;
529	}
530	#endif
531
532	/// \brief Propagate known metadata from one instruction to another.
533	static void propagateMetadata(Instruction *To, const Instruction *From) {
534	SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
535	From->getAllMetadataOtherThanDebugLoc(Metadata);
536	// Copy only the whitelisted kinds below; every other kind is skipped.
537	for (const auto &M : Metadata) {
538	unsigned Kind = M.first;
539
540	// These are safe to transfer (this is safe for TBAA, even when we
541	// if-convert, because should that metadata have had a control dependency
542	// on the condition, and thus actually aliased with some other
543	// non-speculated memory access when the condition was false, this would be
544	// caught by the runtime overlap checks).
545	if (Kind != LLVMContext::MD_tbaa &&
546	Kind != LLVMContext::MD_alias_scope &&
547	Kind != LLVMContext::MD_noalias &&
548	Kind != LLVMContext::MD_fpmath)
549	continue;
550
551	To->setMetadata(Kind, M.second);
552	}
553	}
554
555	/// \brief Propagate known metadata from one instruction to a vector of others. Entries that are not instructions are skipped.
556	static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
557	for (Value *V : To)
558	if (Instruction *I = dyn_cast<Instruction>(V))
559	propagateMetadata(I, From);
560	}
561
562	/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
563	/// to what vectorization factor.
564	/// This class does not look at the profitability of vectorization, only the
565	/// legality. This class has two main kinds of checks:
566	/// * Memory checks - The code in canVectorizeMemory checks if vectorization
567	/// will change the order of memory accesses in a way that will change the
568	/// correctness of the program.
569	/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
570	/// checks for a number of different conditions, such as the availability of a
571	/// single induction variable, that all types are supported and vectorize-able,
572	/// etc. This code reflects the capabilities of InnerLoopVectorizer.
573	/// This class is also used by InnerLoopVectorizer for identifying
574	/// induction variable and the different reduction variables.
575	class LoopVectorizationLegality {
576	public:
577	unsigned NumLoads;
578	unsigned NumStores;
579	unsigned NumPredStores;
580
581	LoopVectorizationLegality(Loop L, ScalarEvolution SE, const DataLayout *DL,
582	DominatorTree DT, TargetLibraryInfo TLI,
583	AliasAnalysis AA, Function F)
584	: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
585	DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
586	WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
587	}
588
589	/// This enum represents the kinds of reductions that we support.
590	enum ReductionKind {
591	RK_NoReduction, ///< Not a reduction.
592	RK_IntegerAdd, ///< Sum of integers.
593	RK_IntegerMult, ///< Product of integers.
594	RK_IntegerOr, ///< Bitwise or logical OR of numbers.
595	RK_IntegerAnd, ///< Bitwise or logical AND of numbers.
596	RK_IntegerXor, ///< Bitwise or logical XOR of numbers.
597	RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
598	RK_FloatAdd, ///< Sum of floats.
599	RK_FloatMult, ///< Product of floats.
600	RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()).
601	};
602
603	/// This enum represents the kinds of inductions that we support.
604	enum InductionKind {
605	IK_NoInduction, ///< Not an induction variable.
606	IK_IntInduction, ///< Integer induction variable. Step = 1.
607	IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
608	IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem).
609	IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
610	};
611
612	// This enum represents the kind of minmax reduction.
613	enum MinMaxReductionKind {
614	MRK_Invalid,
615	MRK_UIntMin,
616	MRK_UIntMax,
617	MRK_SIntMin,
618	MRK_SIntMax,
619	MRK_FloatMin,
620	MRK_FloatMax
621	};
622
623	/// This struct holds information about reduction variables.
624	struct ReductionDescriptor {
625	ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
626	Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
627
628	ReductionDescriptor(Value Start, Instruction Exit, ReductionKind K,
629	MinMaxReductionKind MK)
630	: StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
631
632	// The starting value of the reduction.
633	// It does not have to be zero!
634	TrackingVH<Value> StartValue;
635	// The instruction whose value is used outside the loop.
636	Instruction *LoopExitInstr;
637	// The kind of the reduction.
638	ReductionKind Kind;
639	// If this is a min/max reduction, the kind of reduction.
640	MinMaxReductionKind MinMaxKind;
641	};
642
643	/// This POD struct holds information about a potential reduction operation.
644	struct ReductionInstDesc {
645	ReductionInstDesc(bool IsRedux, Instruction *I) :
646	IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
647
648	ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
649	IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
650
651	// Is this instruction a reduction candidate.
652	bool IsReduction;
653	// The last instruction in a min/max pattern (select of the select(icmp())
654	// pattern), or the current reduction instruction otherwise.
655	Instruction *PatternLastInst;
656	// If this is a min/max pattern the comparison predicate.
657	MinMaxReductionKind MinMaxKind;
658	};
659
660	/// This struct holds information about the memory runtime legality
661	/// check that a group of pointers do not overlap.
662	struct RuntimePointerCheck {
663	RuntimePointerCheck() : Need(false) {}
664
665	/// Reset the state of the pointer runtime information.
666	void reset() {
667	Need = false;
668	Pointers.clear();
669	Starts.clear();
670	Ends.clear();
671	IsWritePtr.clear();
672	DependencySetId.clear();
673	AliasSetId.clear();
674	}
675
676	/// Insert a pointer and calculate the start and end SCEVs.
677	void insert(ScalarEvolution SE, Loop Lp, Value *Ptr, bool WritePtr,
678	unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
679
680	/// This flag indicates if we need to add the runtime check.
681	bool Need;
682	/// Holds the pointers that we need to check.
683	SmallVector<TrackingVH<Value>, 2> Pointers;
684	/// Holds the pointer value at the beginning of the loop.
685	SmallVector<const SCEV*, 2> Starts;
686	/// Holds the pointer value at the end of the loop.
687	SmallVector<const SCEV*, 2> Ends;
688	/// Holds the information if this pointer is used for writing to memory.
689	SmallVector<bool, 2> IsWritePtr;
690	/// Holds the id of the set of pointers that could be dependent because of a
691	/// shared underlying object.
692	SmallVector<unsigned, 2> DependencySetId;
693	/// Holds the id of the disjoint alias set to which this pointer belongs.
694	SmallVector<unsigned, 2> AliasSetId;
695	};
696
697	/// A struct for saving information about induction variables.
698	struct InductionInfo {
699	InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
700	InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
701	/// Start value.
702	TrackingVH<Value> StartValue;
703	/// Induction kind.
704	InductionKind IK;
705	};
706
707	/// ReductionList contains the reduction descriptors for all
708	/// of the reductions that were found in the loop.
709	typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
710
711	/// InductionList saves induction variables and maps them to the
712	/// induction descriptor.
713	typedef MapVector<PHINode*, InductionInfo> InductionList;
714
715	/// Returns true if it is legal to vectorize this loop.
716	/// This does not mean that it is profitable to vectorize this
717	/// loop, only that it is legal to do so.
718	bool canVectorize();
719
720	/// Returns the Induction variable.
721	PHINode *getInduction() { return Induction; }
722
723	/// Returns the reduction variables found in the loop.
724	ReductionList *getReductionVars() { return &Reductions; }
725
726	/// Returns the induction variables found in the loop.
727	InductionList *getInductionVars() { return &Inductions; }
728
729	/// Returns the widest induction type.
730	Type *getWidestInductionType() { return WidestIndTy; }
731
732	/// Returns True if V is an induction variable in this loop.
733	bool isInductionVariable(const Value *V);
734
735	/// Return true if the block BB needs to be predicated in order for the loop
736	/// to be vectorized.
737	bool blockNeedsPredication(BasicBlock *BB);
738
739	/// Check if this pointer is consecutive when vectorizing. This happens
740	/// when the last index of the GEP is the induction variable, or that the
741	/// pointer itself is an induction variable.
742	/// This check allows us to vectorize A[idx] into a wide load/store.
743	/// Returns:
744	/// 0 - Stride is unknown or non-consecutive.
745	/// 1 - Address is consecutive.
746	/// -1 - Address is consecutive, and decreasing.
747	int isConsecutivePtr(Value *Ptr);
748
749	/// Returns true if the value V is uniform within the loop.
750	bool isUniform(Value *V);
751
752	/// Returns true if this instruction will remain scalar after vectorization.
753	bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
754
755	/// Returns the information that we collected about runtime memory check.
756	RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
757
758	/// This function returns the identity element (or neutral element) for
759	/// the operation K.
760	static Constant getReductionIdentity(ReductionKind K, Type Tp);
761
762	unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
763
764	bool hasStride(Value *V) { return StrideSet.count(V); }
765	bool mustCheckStrides() { return !StrideSet.empty(); }
766	SmallPtrSet<Value *, 8>::iterator strides_begin() {
767	return StrideSet.begin();
768	}
769	SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
770
771	private:
772	/// Check if a single basic block loop is vectorizable.
773	/// At this point we know that this is a loop with a constant trip count
774	/// and we only need to check individual instructions.
775	bool canVectorizeInstrs();
776
777	/// When we vectorize loops we may change the order in which
778	/// we read and write from memory. This method checks if it is
779	/// legal to vectorize the code, considering only memory constraints.
780	/// Returns true if the loop is vectorizable
781	bool canVectorizeMemory();
782
783	/// Return true if we can vectorize this loop using the IF-conversion
784	/// transformation.
785	bool canVectorizeWithIfConvert();
786
787	/// Collect the variables that need to stay uniform after vectorization.
788	void collectLoopUniforms();
789
790	/// Return true if all of the instructions in the block can be speculatively
791	/// executed. \p SafePtrs is a list of addresses that are known to be legal
792	/// and we know that we can read from them without segfault.
793	bool blockCanBePredicated(BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs);
794
795	/// Returns True, if 'Phi' is the kind of reduction variable for type
796	/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
797	bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
798	/// Returns a struct describing if the instruction 'I' can be a reduction
799	/// variable of type 'Kind'. If the reduction is a min/max pattern of
800	/// select(icmp()) this function advances the instruction pointer 'I' from the
801	/// compare instruction to the select instruction and stores this pointer in
802	/// 'PatternLastInst' member of the returned struct.
803	ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
804	ReductionInstDesc &Desc);
805	/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
806	/// pattern corresponding to a min(X, Y) or max(X, Y).
807	static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
808	ReductionInstDesc &Prev);
809	/// Returns the induction kind of Phi. This function may return NoInduction
810	/// if the PHI is not an induction variable.
811	InductionKind isInductionVariable(PHINode *Phi);
812
813	/// \brief Collect memory access with loop invariant strides.
814	///
815	/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
816	/// invariant.
817	void collectStridedAcccess(Value *LoadOrStoreInst);
818
819	/// Report an analysis message to assist the user in diagnosing loops that are
820	/// not vectorized.
821	void emitAnalysis(Report &Message) {
822	DebugLoc DL = TheLoop->getStartLoc();
823	if (Instruction *I = Message.getInstr())
824	DL = I->getDebugLoc();
825	emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE"loop-vectorize",
826	*TheFunction, DL, Message.str());
827	}
828
829	/// The loop that we evaluate.
830	Loop *TheLoop;
831	/// Scev analysis.
832	ScalarEvolution *SE;
833	/// DataLayout analysis.
834	const DataLayout *DL;
835	/// Dominators.
836	DominatorTree *DT;
837	/// Target Library Info.
838	TargetLibraryInfo *TLI;
839	/// Alias analysis.
840	AliasAnalysis *AA;
841	/// Parent function
842	Function *TheFunction;
843
844	// --- vectorization state --- //
845
846	/// Holds the integer induction variable. This is the counter of the
847	/// loop.
848	PHINode *Induction;
849	/// Holds the reduction variables.
850	ReductionList Reductions;
851	/// Holds all of the induction variables that we found in the loop.
852	/// Notice that inductions don't need to start at zero and that induction
853	/// variables can be pointers.
854	InductionList Inductions;
855	/// Holds the widest induction type encountered.
856	Type *WidestIndTy;
857
858	/// Allowed outside users. This holds the reduction
859	/// vars which can be accessed from outside the loop.
860	SmallPtrSet<Value*, 4> AllowedExit;
861	/// This set holds the variables which are known to be uniform after
862	/// vectorization.
863	SmallPtrSet<Instruction*, 4> Uniforms;
864	/// We need to check that all of the pointers in this list are disjoint
865	/// at runtime.
866	RuntimePointerCheck PtrRtCheck;
867	/// Can we assume the absence of NaNs.
868	bool HasFunNoNaNAttr;
869
870	unsigned MaxSafeDepDistBytes;
871
872	ValueToValueMap Strides;
873	SmallPtrSet<Value *, 8> StrideSet;
874	};
875
876	/// LoopVectorizationCostModel - estimates the expected speedups due to
877	/// vectorization.
878	/// In many cases vectorization is not profitable. This can happen because of
879	/// a number of reasons. In this class we mainly attempt to predict the
880	/// expected speedup/slowdowns due to the supported instruction set. We use the
881	/// TargetTransformInfo to query the different backends for the cost of
882	/// different operations.
883	class LoopVectorizationCostModel {
884	public:
885	LoopVectorizationCostModel(Loop L, ScalarEvolution SE, LoopInfo *LI,
886	LoopVectorizationLegality *Legal,
887	const TargetTransformInfo &TTI,
888	const DataLayout DL, const TargetLibraryInfo TLI,
889	AssumptionTracker AT, const Function F,
890	const LoopVectorizeHints *Hints)
891	: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
892	TheFunction(F), Hints(Hints) {
893	CodeMetrics::collectEphemeralValues(L, AT, EphValues);
894	}
895
896	/// Information about vectorization costs
897	struct VectorizationFactor {
898	unsigned Width; // Vector width with best cost
899	unsigned Cost; // Cost of the loop with that width
900	};
901	/// \return The most profitable vectorization factor and the cost of that VF.
902	/// This method checks every power of two up to VF. If UserVF is not ZERO
903	/// then this vectorization factor will be selected if vectorization is
904	/// possible.
905	VectorizationFactor selectVectorizationFactor(bool OptForSize);
906
907	/// \return The size (in bits) of the widest type in the code that
908	/// needs to be vectorized. We ignore values that remain scalar such as
909	/// 64 bit loop indices.
910	unsigned getWidestType();
911
912	/// \return The most profitable unroll factor.
913	/// If UserUF is non-zero then this method finds the best unroll-factor
914	/// based on register pressure and other parameters.
915	/// VF and LoopCost are the selected vectorization factor and the cost of the
916	/// selected VF.
917	unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
918
919	/// \brief A struct that represents some properties of the register usage
920	/// of a loop.
921	struct RegisterUsage {
922	/// Holds the number of loop invariant values that are used in the loop.
923	unsigned LoopInvariantRegs;
924	/// Holds the maximum number of concurrent live intervals in the loop.
925	unsigned MaxLocalUsers;
926	/// Holds the number of instructions in the loop.
927	unsigned NumInstructions;
928	};
929
930	/// \return information about the register usage of the loop.
931	RegisterUsage calculateRegisterUsage();
932
933	private:
934	/// Returns the expected execution cost. The unit of the cost does
935	/// not matter because we use the 'cost' units to compare different
936	/// vector widths. The cost that is returned is not normalized by
937	/// the factor width.
938	unsigned expectedCost(unsigned VF);
939
940	/// Returns the execution time cost of an instruction for a given vector
941	/// width. Vector width of one means scalar.
942	unsigned getInstructionCost(Instruction *I, unsigned VF);
943
944	/// A helper function for converting Scalar types to vector types.
945	/// If the incoming type is void, we return void. If the VF is 1, we return
946	/// the scalar type.
947	static Type* ToVectorTy(Type *Scalar, unsigned VF);
948
949	/// Returns whether the instruction is a load or store and will be a emitted
950	/// as a vector operation.
951	bool isConsecutiveLoadOrStore(Instruction *I);
952
953	/// Report an analysis message to assist the user in diagnosing loops that are
954	/// not vectorized.
955	void emitAnalysis(Report &Message) {
956	DebugLoc DL = TheLoop->getStartLoc();
957	if (Instruction *I = Message.getInstr())
958	DL = I->getDebugLoc();
959	emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE"loop-vectorize",
960	*TheFunction, DL, Message.str());
961	}
962
963	/// Values used only by @llvm.assume calls.
964	SmallPtrSet<const Value *, 32> EphValues;
965
966	/// The loop that we evaluate.
967	Loop *TheLoop;
968	/// Scev analysis.
969	ScalarEvolution *SE;
970	/// Loop Info analysis.
971	LoopInfo *LI;
972	/// Vectorization legality.
973	LoopVectorizationLegality *Legal;
974	/// Vector target information.
975	const TargetTransformInfo &TTI;
976	/// Target data layout information.
977	const DataLayout *DL;
978	/// Target Library Info.
979	const TargetLibraryInfo *TLI;
980	const Function *TheFunction;
981	// Loop Vectorize Hint.
982	const LoopVectorizeHints *Hints;
983	};
984
985	/// Utility class for getting and setting loop vectorizer hints in the form
986	/// of loop metadata.
987	/// This class keeps a number of loop annotations locally (as member variables)
988	/// and can, upon request, write them back as metadata on the loop. It will
989	/// initially scan the loop for existing metadata, and will update the local
990	/// values based on information in the loop.
991	/// We cannot write all values to metadata, as the mere presence of some info,
992	/// for example 'force', means a decision has been made. So, we need to be
993	/// careful NOT to add them if the user hasn't specifically asked so.
994	class LoopVectorizeHints {
995	enum HintKind {
996	HK_WIDTH,
997	HK_UNROLL,
998	HK_FORCE
999	};
1000
1001	/// Hint - associates name and validation with the hint value.
1002	struct Hint {
1003	const char * Name;
1004	unsigned Value; // This may have to change for non-numeric values.
1005	HintKind Kind;
1006
1007	Hint(const char * Name, unsigned Value, HintKind Kind)
1008	: Name(Name), Value(Value), Kind(Kind) { }
1009
1010	bool validate(unsigned Val) {
1011	switch (Kind) {
1012	case HK_WIDTH:
1013	return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
1014	case HK_UNROLL:
1015	return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1016	case HK_FORCE:
1017	return (Val <= 1);
1018	}
1019	return false;
1020	}
1021	};
1022
1023	/// Vectorization width.
1024	Hint Width;
1025	/// Vectorization interleave factor.
1026	Hint Interleave;
1027	/// Vectorization forced
1028	Hint Force;
1029
1030	/// Return the loop metadata prefix.
1031	static StringRef Prefix() { return "llvm.loop."; }
1032
1033	public:
1034	enum ForceKind {
1035	FK_Undefined = -1, ///< Not selected.
1036	FK_Disabled = 0, ///< Forcing disabled.
1037	FK_Enabled = 1, ///< Forcing enabled.
1038	};
1039
1040	LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
1041	: Width("vectorize.width", VectorizationFactor, HK_WIDTH),
1042	Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1043	Force("vectorize.enable", FK_Undefined, HK_FORCE),
1044	TheLoop(L) {
1045	// Populate values with existing loop metadata.
1046	getHintsFromMetadata();
1047
1048	// force-vector-interleave overrides DisableInterleaving.
1049	if (VectorizationInterleave.getNumOccurrences() > 0)
1050	Interleave.Value = VectorizationInterleave;
1051
1052	DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (DisableInterleaving && Interleave .Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n" ; } } while (0)
1053	<< "LV: Interleaving disabled by the pass manager\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (DisableInterleaving && Interleave .Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n" ; } } while (0);
1054	}
1055
1056	/// Mark the loop L as already vectorized by setting the width to 1.
1057	void setAlreadyVectorized() {
1058	Width.Value = Interleave.Value = 1;
1059	Hint Hints[] = {Width, Interleave};
1060	writeHintsToMetadata(Hints);
1061	}
1062
1063	/// Dumps all the hint information.
1064	std::string emitRemark() const {
1065	Report R;
1066	if (Force.Value == LoopVectorizeHints::FK_Disabled)
1067	R << "vectorization is explicitly disabled";
1068	else {
1069	R << "use -Rpass-analysis=loop-vectorize for more info";
1070	if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1071	R << " (Force=true";
1072	if (Width.Value != 0)
1073	R << ", Vector Width=" << Width.Value;
1074	if (Interleave.Value != 0)
1075	R << ", Interleave Count=" << Interleave.Value;
1076	R << ")";
1077	}
1078	}
1079
1080	return R.str();
1081	}
1082
1083	unsigned getWidth() const { return Width.Value; }
1084	unsigned getInterleave() const { return Interleave.Value; }
1085	enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1086
1087	private:
1088	/// Find hints specified in the loop metadata and update local values.
1089	void getHintsFromMetadata() {
1090	MDNode *LoopID = TheLoop->getLoopID();
1091	if (!LoopID)
1092	return;
1093
1094	// First operand should refer to the loop id itself.
1095	assert(LoopID->getNumOperands() > 0 && "requires at least one operand")((LoopID->getNumOperands() > 0 && "requires at least one operand" ) ? static_cast<void> (0) : __assert_fail ("LoopID->getNumOperands() > 0 && \"requires at least one operand\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1095, __PRETTY_FUNCTION__));
1096	assert(LoopID->getOperand(0) == LoopID && "invalid loop id")((LoopID->getOperand(0) == LoopID && "invalid loop id" ) ? static_cast<void> (0) : __assert_fail ("LoopID->getOperand(0) == LoopID && \"invalid loop id\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1096, __PRETTY_FUNCTION__));
1097
1098	for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1099	const MDString *S = nullptr;
1100	SmallVector<Value*, 4> Args;
1101
1102	// The expected hint is either a MDString or a MDNode with the first
1103	// operand a MDString.
1104	if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1105	if (!MD \|\| MD->getNumOperands() == 0)
1106	continue;
1107	S = dyn_cast<MDString>(MD->getOperand(0));
1108	for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1109	Args.push_back(MD->getOperand(i));
1110	} else {
1111	S = dyn_cast<MDString>(LoopID->getOperand(i));
1112	assert(Args.size() == 0 && "too many arguments for MDString")((Args.size() == 0 && "too many arguments for MDString" ) ? static_cast<void> (0) : __assert_fail ("Args.size() == 0 && \"too many arguments for MDString\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1112, __PRETTY_FUNCTION__));
1113	}
1114
1115	if (!S)
1116	continue;
1117
1118	// Check if the hint starts with the loop metadata prefix.
1119	StringRef Name = S->getString();
1120	if (Args.size() == 1)
1121	setHint(Name, Args[0]);
1122	}
1123	}
1124
1125	/// Checks string hint with one operand and set value if valid.
1126	void setHint(StringRef Name, Value *Arg) {
1127	if (!Name.startswith(Prefix()))
1128	return;
1129	Name = Name.substr(Prefix().size(), StringRef::npos);
1130
1131	const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
1132	if (!C) return;
1133	unsigned Val = C->getZExtValue();
1134
1135	Hint *Hints[] = {&Width, &Interleave, &Force};
1136	for (auto H : Hints) {
1137	if (Name == H->Name) {
1138	if (H->validate(Val))
1139	H->Value = Val;
1140	else
1141	DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"; } } while (0);
1142	break;
1143	}
1144	}
1145	}
1146
1147	/// Create a new hint from name / value pair.
1148	MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1149	LLVMContext &Context = TheLoop->getHeader()->getContext();
1150	Value *Vals[] = {MDString::get(Context, Name),
1151	ConstantInt::get(Type::getInt32Ty(Context), V)};
1152	return MDNode::get(Context, Vals);
1153	}
1154
1155	/// Matches metadata with hint name.
1156	bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1157	MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
1158	if (!Name)
1159	return false;
1160
1161	for (auto H : HintTypes)
1162	if (Name->getString().endswith(H.Name))
1163	return true;
1164	return false;
1165	}
1166
1167	/// Sets current hints into loop metadata, keeping other values intact.
1168	void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1169	if (HintTypes.size() == 0)
1170	return;
1171
1172	// Reserve the first element to LoopID (see below).
1173	SmallVector<Value*, 4> Vals(1);
1174	// If the loop already has metadata, then ignore the existing operands.
1175	MDNode *LoopID = TheLoop->getLoopID();
1176	if (LoopID) {
1177	for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1178	MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1179	// If node in update list, ignore old value.
1180	if (!matchesHintMetadataName(Node, HintTypes))
1181	Vals.push_back(Node);
1182	}
1183	}
1184
1185	// Now, add the missing hints.
1186	for (auto H : HintTypes)
1187	Vals.push_back(
1188	createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1189
1190	// Replace current metadata node with new one.
1191	LLVMContext &Context = TheLoop->getHeader()->getContext();
1192	MDNode *NewLoopID = MDNode::get(Context, Vals);
1193	// Set operand 0 to refer to the loop id itself.
1194	NewLoopID->replaceOperandWith(0, NewLoopID);
1195
1196	TheLoop->setLoopID(NewLoopID);
1197	if (LoopID)
1198	LoopID->replaceAllUsesWith(NewLoopID);
1199	LoopID = NewLoopID;
	Value stored to 'LoopID' is never read
1200	}
1201
1202	/// The loop these hints belong to.
1203	const Loop *TheLoop;
1204	};
1205
1206	static void emitMissedWarning(Function F, Loop L,
1207	const LoopVectorizeHints &LH) {
1208	emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE"loop-vectorize", *F,
1209	L->getStartLoc(), LH.emitRemark());
1210
1211	if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1212	if (LH.getWidth() != 1)
1213	emitLoopVectorizeWarning(
1214	F->getContext(), *F, L->getStartLoc(),
1215	"failed explicitly specified loop vectorization");
1216	else if (LH.getInterleave() != 1)
1217	emitLoopInterleaveWarning(
1218	F->getContext(), *F, L->getStartLoc(),
1219	"failed explicitly specified loop interleaving");
1220	}
1221	}
1222
1223	static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
1224	if (L.empty())
1225	return V.push_back(&L);
1226
1227	for (Loop *InnerL : L)
1228	addInnerLoop(*InnerL, V);
1229	}
1230
1231	/// The LoopVectorize Pass.
1232	struct LoopVectorize : public FunctionPass {
1233	/// Pass identification, replacement for typeid
1234	static char ID;
1235
1236	explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
1237	: FunctionPass(ID),
1238	DisableUnrolling(NoUnrolling),
1239	AlwaysVectorize(AlwaysVectorize) {
1240	initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1241	}
1242
1243	ScalarEvolution *SE;
1244	const DataLayout *DL;
1245	LoopInfo *LI;
1246	TargetTransformInfo *TTI;
1247	DominatorTree *DT;
1248	BlockFrequencyInfo *BFI;
1249	TargetLibraryInfo *TLI;
1250	AliasAnalysis *AA;
1251	AssumptionTracker *AT;
1252	bool DisableUnrolling;
1253	bool AlwaysVectorize;
1254
1255	BlockFrequency ColdEntryFreq;
1256
1257	bool runOnFunction(Function &F) override {
1258	SE = &getAnalysis<ScalarEvolution>();
1259	DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
1260	DL = DLP ? &DLP->getDataLayout() : nullptr;
1261	LI = &getAnalysis<LoopInfo>();
1262	TTI = &getAnalysis<TargetTransformInfo>();
1263	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1264	BFI = &getAnalysis<BlockFrequencyInfo>();
1265	TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
1266	AA = &getAnalysis<AliasAnalysis>();
1267	AT = &getAnalysis<AssumptionTracker>();
1268
1269	// Compute some weights outside of the loop over the loops. Compute this
1270	// using a BranchProbability to re-use its scaling math.
1271	const BranchProbability ColdProb(1, 5); // 20%
1272	ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
1273
1274	// If the target claims to have no vector registers don't attempt
1275	// vectorization.
1276	if (!TTI->getNumberOfRegisters(true))
1277	return false;
1278
1279	if (!DL) {
1280	DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Not vectorizing " << F.getName() << ": Missing data layout\n"; } } while (0)
1281	<< ": Missing data layout\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Not vectorizing " << F.getName() << ": Missing data layout\n"; } } while (0);
1282	return false;
1283	}
1284
1285	// Build up a worklist of inner-loops to vectorize. This is necessary as
1286	// the act of vectorizing or partially unrolling a loop creates new loops
1287	// and can invalidate iterators across the loops.
1288	SmallVector<Loop *, 8> Worklist;
1289
1290	for (Loop L : LI)
1291	addInnerLoop(*L, Worklist);
1292
1293	LoopsAnalyzed += Worklist.size();
1294
1295	// Now walk the identified inner loops.
1296	bool Changed = false;
1297	while (!Worklist.empty())
1298	Changed \|= processLoop(Worklist.pop_back_val());
1299
1300	// Process each loop nest in the function.
1301	return Changed;
1302	}
1303
1304	bool processLoop(Loop *L) {
1305	assert(L->empty() && "Only process inner loops.")((L->empty() && "Only process inner loops.") ? static_cast <void> (0) : __assert_fail ("L->empty() && \"Only process inner loops.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1305, __PRETTY_FUNCTION__));
1306
1307	#ifndef NDEBUG
1308	const std::string DebugLocStr = getDebugLocString(L);
1309	#endif /* NDEBUG */
1310
1311	DEBUG(dbgs() << "\nLV: Checking a loop in \""do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0)
1312	<< L->getHeader()->getParent()->getName() << "\" from "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0)
1313	<< DebugLocStr << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0);
1314
1315	LoopVectorizeHints Hints(L, DisableUnrolling);
1316
1317	DEBUG(dbgs() << "LV: Loop hints:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1318	<< " force="do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1319	<< (Hints.getForce() == LoopVectorizeHints::FK_Disableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1320	? "disabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1321	: (Hints.getForce() == LoopVectorizeHints::FK_Enableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1322	? "enabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1323	: "?")) << " width=" << Hints.getWidth()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0)
1324	<< " unroll=" << Hints.getInterleave() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0);
1325
1326	// Function containing loop
1327	Function *F = L->getHeader()->getParent();
1328
1329	// Looking at the diagnostic output is the only way to determine if a loop
1330	// was vectorized (other than looking at the IR or machine code), so it
1331	// is important to generate an optimization remark for each loop. Most of
1332	// these messages are generated by emitOptimizationRemarkAnalysis. Remarks
1333	// generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
1334	// less verbose reporting vectorized loops and unvectorized loops that may
1335	// benefit from vectorization, respectively.
1336
1337	if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
1338	DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n" ; } } while (0);
1339	emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE"loop-vectorize", *F,
1340	L->getStartLoc(), Hints.emitRemark());
1341	return false;
1342	}
1343
1344	if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
1345	DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n" ; } } while (0);
1346	emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE"loop-vectorize", *F,
1347	L->getStartLoc(), Hints.emitRemark());
1348	return false;
1349	}
1350
1351	if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
1352	DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n" ; } } while (0);
1353	emitOptimizationRemarkAnalysis(
1354	F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1355	"loop not vectorized: vector width and interleave count are "
1356	"explicitly set to 1");
1357	return false;
1358	}
1359
1360	// Check the loop for a trip count threshold:
1361	// do not vectorize loops with a tiny trip count.
1362	const unsigned TC = SE->getSmallConstantTripCount(L);
1363	if (TC > 0u && TC < TinyTripCountVectorThreshold) {
1364	DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."; } } while (0 )
1365	<< "This loop is not worth vectorizing.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."; } } while (0 );
1366	if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
1367	DEBUG(dbgs() << " But vectorizing was explicitly forced.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " But vectorizing was explicitly forced.\n" ; } } while (0);
1368	else {
1369	DEBUG(dbgs() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\n"; } } while (0);
1370	emitOptimizationRemarkAnalysis(
1371	F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1372	"vectorization is not beneficial and is not explicitly forced");
1373	return false;
1374	}
1375	}
1376
1377	// Check if it is legal to vectorize the loop.
1378	LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
1379	if (!LVL.canVectorize()) {
1380	DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Cannot prove legality.\n" ; } } while (0);
1381	emitMissedWarning(F, L, Hints);
1382	return false;
1383	}
1384
1385	// Use the cost model.
1386	LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F,
1387	&Hints);
1388
1389	// Check the function attributes to find out if this function should be
1390	// optimized for size.
1391	bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1392	F->hasFnAttribute(Attribute::OptimizeForSize);
1393
1394	// Compute the weighted frequency of this loop being executed and see if it
1395	// is less than 20% of the function entry baseline frequency. Note that we
1396	// always have a canonical loop here because we think we can vectorize.
1397	// FIXME: This is hidden behind a flag due to pervasive problems with
1398	// exactly what block frequency models.
1399	if (LoopVectorizeWithBlockFrequency) {
1400	BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
1401	if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1402	LoopEntryFreq < ColdEntryFreq)
1403	OptForSize = true;
1404	}
1405
1406	// Check the function attributes to see if implicit floats are allowed.
1407	// FIXME: This check doesn't seem possibly correct -- what if the loop is
1408	// an integer loop and the vector instructions selected are purely integer
1409	// vector instructions?
1410	if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1411	DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"; } } while (0)
1412	"attribute is used.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"; } } while (0);
1413	emitOptimizationRemarkAnalysis(
1414	F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1415	"loop not vectorized due to NoImplicitFloat attribute");
1416	emitMissedWarning(F, L, Hints);
1417	return false;
1418	}
1419
1420	// Select the optimal vectorization factor.
1421	const LoopVectorizationCostModel::VectorizationFactor VF =
1422	CM.selectVectorizationFactor(OptForSize);
1423
1424	// Select the unroll factor.
1425	const unsigned UF =
1426	CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
1427
1428	DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'; } } while (0)
1429	<< DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'; } } while (0);
1430	DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unroll Factor is " << UF << '\n'; } } while (0);
1431
1432	if (VF.Width == 1) {
1433	DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vectorization is possible but not beneficial\n" ; } } while (0);
1434
1435	if (UF == 1) {
1436	emitOptimizationRemarkAnalysis(
1437	F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1438	"not beneficial to vectorize and user disabled interleaving");
1439	return false;
1440	}
1441	DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Trying to at least unroll the loops.\n" ; } } while (0);
1442
1443	// Report the unrolling decision.
1444	emitOptimizationRemark(F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1445	Twine("unrolled with interleaving factor " +
1446	Twine(UF) +
1447	" (vectorization not beneficial)"));
1448
1449	// We decided not to vectorize, but we may want to unroll.
1450
1451	InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
1452	Unroller.vectorize(&LVL);
1453	} else {
1454	// If we decided that it is legal to vectorize the loop then do it.
1455	InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
1456	LB.vectorize(&LVL);
1457	++LoopsVectorized;
1458
1459	// Report the vectorization decision.
1460	emitOptimizationRemark(
1461	F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(),
1462	Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
1463	", unrolling interleave factor: " + Twine(UF) + ")");
1464	}
1465
1466	// Mark the loop as already vectorized to avoid vectorizing again.
1467	Hints.setAlreadyVectorized();
1468
1469	DEBUG(verifyFunction(L->getHeader()->getParent()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { verifyFunction(L->getHeader()->getParent ()); } } while (0);
1470	return true;
1471	}
1472
1473	void getAnalysisUsage(AnalysisUsage &AU) const override {
1474	AU.addRequired<AssumptionTracker>();
1475	AU.addRequiredID(LoopSimplifyID);
1476	AU.addRequiredID(LCSSAID);
1477	AU.addRequired<BlockFrequencyInfo>();
1478	AU.addRequired<DominatorTreeWrapperPass>();
1479	AU.addRequired<LoopInfo>();
1480	AU.addRequired<ScalarEvolution>();
1481	AU.addRequired<TargetTransformInfo>();
1482	AU.addRequired<AliasAnalysis>();
1483	AU.addPreserved<LoopInfo>();
1484	AU.addPreserved<DominatorTreeWrapperPass>();
1485	AU.addPreserved<AliasAnalysis>();
1486	}
1487
1488	};
1489
1490	} // end anonymous namespace
1491
1492	//===----------------------------------------------------------------------===//
1493	// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1494	// LoopVectorizationCostModel.
1495	//===----------------------------------------------------------------------===//
1496
1497	static Value stripIntegerCast(Value V) {
1498	if (CastInst *CI = dyn_cast<CastInst>(V))
1499	if (CI->getOperand(0)->getType()->isIntegerTy())
1500	return CI->getOperand(0);
1501	return V;
1502	}
1503
1504	///\brief Replaces the symbolic stride in a pointer SCEV expression by one.
1505	///
1506	/// If \p OrigPtr is not null, use it to look up the stride value instead of
1507	/// \p Ptr.
1508	static const SCEV replaceSymbolicStrideSCEV(ScalarEvolution SE,
1509	ValueToValueMap &PtrToStride,
1510	Value Ptr, Value OrigPtr = nullptr) {
1511
1512	const SCEV *OrigSCEV = SE->getSCEV(Ptr);
1513
1514	// If there is an entry in the map return the SCEV of the pointer with the
1515	// symbolic stride replaced by one.
1516	ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
1517	if (SI != PtrToStride.end()) {
1518	Value *StrideVal = SI->second;
1519
1520	// Strip casts.
1521	StrideVal = stripIntegerCast(StrideVal);
1522
1523	// Replace symbolic stride by one.
1524	Value *One = ConstantInt::get(StrideVal->getType(), 1);
1525	ValueToValueMap RewriteMap;
1526	RewriteMap[StrideVal] = One;
1527
1528	const SCEV *ByOne =
1529	SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
1530	DEBUG(dbgs() << "LV: Replacing SCEV: " << OrigSCEV << " by: " << ByOnedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Replacing SCEV: " << OrigSCEV << " by: " << ByOne << "\n"; } } while (0)
1531	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Replacing SCEV: " << OrigSCEV << " by: " << ByOne << "\n"; } } while (0);
1532	return ByOne;
1533	}
1534
1535	// Otherwise, just return the SCEV of the original pointer.
1536	return SE->getSCEV(Ptr);
1537	}
1538
1539	void LoopVectorizationLegality::RuntimePointerCheck::insert(
1540	ScalarEvolution SE, Loop Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
1541	unsigned ASId, ValueToValueMap &Strides) {
1542	// Get the stride replaced scev.
1543	const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
1544	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
1545	assert(AR && "Invalid addrec expression")((AR && "Invalid addrec expression") ? static_cast< void> (0) : __assert_fail ("AR && \"Invalid addrec expression\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1545, __PRETTY_FUNCTION__));
1546	const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
1547	const SCEV ScEnd = AR->evaluateAtIteration(Ex, SE);
1548	Pointers.push_back(Ptr);
1549	Starts.push_back(AR->getStart());
1550	Ends.push_back(ScEnd);
1551	IsWritePtr.push_back(WritePtr);
1552	DependencySetId.push_back(DepSetId);
1553	AliasSetId.push_back(ASId);
1554	}
1555
1556	Value InnerLoopVectorizer::getBroadcastInstrs(Value V) {
1557	// We need to place the broadcast of invariant variables outside the loop.
1558	Instruction *Instr = dyn_cast<Instruction>(V);
1559	bool NewInstr =
1560	(Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
1561	Instr->getParent()) != LoopVectorBody.end());
1562	bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
1563
1564	// Place the code for broadcasting invariant variables in the new preheader.
1565	IRBuilder<>::InsertPointGuard Guard(Builder);
1566	if (Invariant)
1567	Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1568
1569	// Broadcast the scalar into all locations in the vector.
1570	Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1571
1572	return Shuf;
1573	}
1574
1575	Value InnerLoopVectorizer::getConsecutiveVector(Value Val, int StartIdx,
1576	bool Negate) {
1577	assert(Val->getType()->isVectorTy() && "Must be a vector")((Val->getType()->isVectorTy() && "Must be a vector" ) ? static_cast<void> (0) : __assert_fail ("Val->getType()->isVectorTy() && \"Must be a vector\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1577, __PRETTY_FUNCTION__));
1578	assert(Val->getType()->getScalarType()->isIntegerTy() &&((Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer") ? static_cast<void> (0) : __assert_fail ("Val->getType()->getScalarType()->isIntegerTy() && \"Elem must be an integer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1579, __PRETTY_FUNCTION__))
1579	"Elem must be an integer")((Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer") ? static_cast<void> (0) : __assert_fail ("Val->getType()->getScalarType()->isIntegerTy() && \"Elem must be an integer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1579, __PRETTY_FUNCTION__));
1580	// Create the types.
1581	Type *ITy = Val->getType()->getScalarType();
1582	VectorType *Ty = cast<VectorType>(Val->getType());
1583	int VLen = Ty->getNumElements();
1584	SmallVector<Constant*, 8> Indices;
1585
1586	// Create a vector of consecutive numbers from zero to VF.
1587	for (int i = 0; i < VLen; ++i) {
1588	int64_t Idx = Negate ? (-i) : i;
1589	Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
1590	}
1591
1592	// Add the consecutive indices to the vector value.
1593	Constant *Cv = ConstantVector::get(Indices);
1594	assert(Cv->getType() == Val->getType() && "Invalid consecutive vec")((Cv->getType() == Val->getType() && "Invalid consecutive vec" ) ? static_cast<void> (0) : __assert_fail ("Cv->getType() == Val->getType() && \"Invalid consecutive vec\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1594, __PRETTY_FUNCTION__));
1595	return Builder.CreateAdd(Val, Cv, "induction");
1596	}
1597
1598	/// \brief Find the operand of the GEP that should be checked for consecutive
1599	/// stores. This ignores trailing indices that have no effect on the final
1600	/// pointer.
1601	static unsigned getGEPInductionOperand(const DataLayout *DL,
1602	const GetElementPtrInst *Gep) {
1603	unsigned LastOperand = Gep->getNumOperands() - 1;
1604	unsigned GEPAllocSize = DL->getTypeAllocSize(
1605	cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
1606
1607	// Walk backwards and try to peel off zeros.
1608	while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
1609	// Find the type we're currently indexing into.
1610	gep_type_iterator GEPTI = gep_type_begin(Gep);
1611	std::advance(GEPTI, LastOperand - 1);
1612
1613	// If it's a type with the same allocation size as the result of the GEP we
1614	// can peel off the zero index.
1615	if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
1616	break;
1617	--LastOperand;
1618	}
1619
1620	return LastOperand;
1621	}
1622
1623	int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
1624	assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr")((Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ) ? static_cast<void> (0) : __assert_fail ("Ptr->getType()->isPointerTy() && \"Unexpected non-ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1624, __PRETTY_FUNCTION__));
1625	// Make sure that the pointer does not point to structs.
1626	if (Ptr->getType()->getPointerElementType()->isAggregateType())
1627	return 0;
1628
1629	// If this value is a pointer induction variable we know it is consecutive.
1630	PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
1631	if (Phi && Inductions.count(Phi)) {
1632	InductionInfo II = Inductions[Phi];
1633	if (IK_PtrInduction == II.IK)
1634	return 1;
1635	else if (IK_ReversePtrInduction == II.IK)
1636	return -1;
1637	}
1638
1639	GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
1640	if (!Gep)
1641	return 0;
1642
1643	unsigned NumOperands = Gep->getNumOperands();
1644	Value *GpPtr = Gep->getPointerOperand();
1645	// If this GEP value is a consecutive pointer induction variable and all of
1646	// the indices are constant then we know it is consecutive. We can
1647	Phi = dyn_cast<PHINode>(GpPtr);
1648	if (Phi && Inductions.count(Phi)) {
1649
1650	// Make sure that the pointer does not point to structs.
1651	PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
1652	if (GepPtrType->getElementType()->isAggregateType())
1653	return 0;
1654
1655	// Make sure that all of the index operands are loop invariant.
1656	for (unsigned i = 1; i < NumOperands; ++i)
1657	if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1658	return 0;
1659
1660	InductionInfo II = Inductions[Phi];
1661	if (IK_PtrInduction == II.IK)
1662	return 1;
1663	else if (IK_ReversePtrInduction == II.IK)
1664	return -1;
1665	}
1666
1667	unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
1668
1669	// Check that all of the gep indices are uniform except for our induction
1670	// operand.
1671	for (unsigned i = 0; i != NumOperands; ++i)
1672	if (i != InductionOperand &&
1673	!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1674	return 0;
1675
1676	// We can emit wide load/stores only if the last non-zero index is the
1677	// induction variable.
1678	const SCEV *Last = nullptr;
1679	if (!Strides.count(Gep))
1680	Last = SE->getSCEV(Gep->getOperand(InductionOperand));
1681	else {
1682	// Because of the multiplication by a stride we can have a s/zext cast.
1683	// We are going to replace this stride by 1 so the cast is safe to ignore.
1684	//
1685	// %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
1686	// %0 = trunc i64 %indvars.iv to i32
1687	// %mul = mul i32 %0, %Stride1
1688	// %idxprom = zext i32 %mul to i64 << Safe cast.
1689	// %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
1690	//
1691	Last = replaceSymbolicStrideSCEV(SE, Strides,
1692	Gep->getOperand(InductionOperand), Gep);
1693	if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
1694	Last =
1695	(C->getSCEVType() == scSignExtend \|\| C->getSCEVType() == scZeroExtend)
1696	? C->getOperand()
1697	: Last;
1698	}
1699	if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
1700	const SCEV Step = AR->getStepRecurrence(SE);
1701
1702	// The memory is consecutive because the last index is consecutive
1703	// and all other indices are loop invariant.
1704	if (Step->isOne())
1705	return 1;
1706	if (Step->isAllOnesValue())
1707	return -1;
1708	}
1709
1710	return 0;
1711	}
1712
1713	bool LoopVectorizationLegality::isUniform(Value *V) {
1714	return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
1715	}
1716
1717	InnerLoopVectorizer::VectorParts&
1718	InnerLoopVectorizer::getVectorValue(Value *V) {
1719	assert(V != Induction && "The new induction variable should not be used.")((V != Induction && "The new induction variable should not be used." ) ? static_cast<void> (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1719, __PRETTY_FUNCTION__));
1720	assert(!V->getType()->isVectorTy() && "Can't widen a vector")((!V->getType()->isVectorTy() && "Can't widen a vector" ) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't widen a vector\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1720, __PRETTY_FUNCTION__));
1721
1722	// If we have a stride that is replaced by one, do it here.
1723	if (Legal->hasStride(V))
1724	V = ConstantInt::get(V->getType(), 1);
1725
1726	// If we have this scalar in the map, return it.
1727	if (WidenMap.has(V))
1728	return WidenMap.get(V);
1729
1730	// If this scalar is unknown, assume that it is a constant or that it is
1731	// loop invariant. Broadcast V and save the value for future uses.
1732	Value *B = getBroadcastInstrs(V);
1733	return WidenMap.splat(V, B);
1734	}
1735
1736	Value InnerLoopVectorizer::reverseVector(Value Vec) {
1737	assert(Vec->getType()->isVectorTy() && "Invalid type")((Vec->getType()->isVectorTy() && "Invalid type" ) ? static_cast<void> (0) : __assert_fail ("Vec->getType()->isVectorTy() && \"Invalid type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1737, __PRETTY_FUNCTION__));
1738	SmallVector<Constant*, 8> ShuffleMask;
1739	for (unsigned i = 0; i < VF; ++i)
1740	ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1741
1742	return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1743	ConstantVector::get(ShuffleMask),
1744	"reverse");
1745	}
1746
1747	void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
1748	// Attempt to issue a wide load.
1749	LoadInst *LI = dyn_cast<LoadInst>(Instr);
1750	StoreInst *SI = dyn_cast<StoreInst>(Instr);
1751
1752	assert((LI \|\| SI) && "Invalid Load/Store instruction")(((LI \|\| SI) && "Invalid Load/Store instruction") ? static_cast <void> (0) : __assert_fail ("(LI \|\| SI) && \"Invalid Load/Store instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1752, __PRETTY_FUNCTION__));
1753
1754	Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
1755	Type *DataTy = VectorType::get(ScalarDataTy, VF);
1756	Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
1757	unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
1758	// An alignment of 0 means target abi alignment. We need to use the scalar's
1759	// target abi alignment in such a case.
1760	if (!Alignment)
1761	Alignment = DL->getABITypeAlignment(ScalarDataTy);
1762	unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
1763	unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
1764	unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
1765
1766	if (SI && Legal->blockNeedsPredication(SI->getParent()))
1767	return scalarizeInstruction(Instr, true);
1768
1769	if (ScalarAllocatedSize != VectorElementSize)
1770	return scalarizeInstruction(Instr);
1771
1772	// If the pointer is loop invariant or if it is non-consecutive,
1773	// scalarize the load.
1774	int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
1775	bool Reverse = ConsecutiveStride < 0;
1776	bool UniformLoad = LI && Legal->isUniform(Ptr);
1777	if (!ConsecutiveStride \|\| UniformLoad)
1778	return scalarizeInstruction(Instr);
1779
1780	Constant *Zero = Builder.getInt32(0);
1781	VectorParts &Entry = WidenMap.get(Instr);
1782
1783	// Handle consecutive loads/stores.
1784	GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1785	if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
1786	setDebugLocFromInst(Builder, Gep);
1787	Value *PtrOperand = Gep->getPointerOperand();
1788	Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
1789	FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
1790
1791	// Create the new GEP with the new induction variable.
1792	GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1793	Gep2->setOperand(0, FirstBasePtr);
1794	Gep2->setName("gep.indvar.base");
1795	Ptr = Builder.Insert(Gep2);
1796	} else if (Gep) {
1797	setDebugLocFromInst(Builder, Gep);
1798	assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),((SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand ()), OrigLoop) && "Base ptr must be invariant") ? static_cast <void> (0) : __assert_fail ("SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && \"Base ptr must be invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1799, __PRETTY_FUNCTION__))
1799	OrigLoop) && "Base ptr must be invariant")((SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand ()), OrigLoop) && "Base ptr must be invariant") ? static_cast <void> (0) : __assert_fail ("SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && \"Base ptr must be invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1799, __PRETTY_FUNCTION__));
1800
1801	// The last index does not have to be the induction. It can be
1802	// consecutive and be a function of the index. For example A[I+1];
1803	unsigned NumOperands = Gep->getNumOperands();
1804	unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
1805	// Create the new GEP with the new induction variable.
1806	GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1807
1808	for (unsigned i = 0; i < NumOperands; ++i) {
1809	Value *GepOperand = Gep->getOperand(i);
1810	Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
1811
1812	// Update last index or loop invariant instruction anchored in loop.
1813	if (i == InductionOperand \|\|
1814	(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
1815	assert((i == InductionOperand \|\|(((i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__))
1816	SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&(((i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__))
1817	"Must be last index or loop invariant")(((i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand \|\| SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__));
1818
1819	VectorParts &GEPParts = getVectorValue(GepOperand);
1820	Value *Index = GEPParts[0];
1821	Index = Builder.CreateExtractElement(Index, Zero);
1822	Gep2->setOperand(i, Index);
1823	Gep2->setName("gep.indvar.idx");
1824	}
1825	}
1826	Ptr = Builder.Insert(Gep2);
1827	} else {
1828	// Use the induction element ptr.
1829	assert(isa<PHINode>(Ptr) && "Invalid induction ptr")((isa<PHINode>(Ptr) && "Invalid induction ptr") ? static_cast<void> (0) : __assert_fail ("isa<PHINode>(Ptr) && \"Invalid induction ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1829, __PRETTY_FUNCTION__));
1830	setDebugLocFromInst(Builder, Ptr);
1831	VectorParts &PtrVal = getVectorValue(Ptr);
1832	Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
1833	}
1834
1835	// Handle Stores:
1836	if (SI) {
1837	assert(!Legal->isUniform(SI->getPointerOperand()) &&((!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses") ? static_cast <void> (0) : __assert_fail ("!Legal->isUniform(SI->getPointerOperand()) && \"We do not allow storing to uniform addresses\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1838, __PRETTY_FUNCTION__))
1838	"We do not allow storing to uniform addresses")((!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses") ? static_cast <void> (0) : __assert_fail ("!Legal->isUniform(SI->getPointerOperand()) && \"We do not allow storing to uniform addresses\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1838, __PRETTY_FUNCTION__));
1839	setDebugLocFromInst(Builder, SI);
1840	// We don't want to update the value in the map as it might be used in
1841	// another expression. So don't use a reference type for "StoredVal".
1842	VectorParts StoredVal = getVectorValue(SI->getValueOperand());
1843
1844	for (unsigned Part = 0; Part < UF; ++Part) {
1845	// Calculate the pointer for the specific unroll-part.
1846	Value PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part VF));
1847
1848	if (Reverse) {
1849	// If we store to reverse consecutive memory locations then we need
1850	// to reverse the order of elements in the stored value.
1851	StoredVal[Part] = reverseVector(StoredVal[Part]);
1852	// If the address is consecutive but reversed, then the
1853	// wide store needs to start at the last vector element.
1854	PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
1855	PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
1856	}
1857
1858	Value *VecPtr = Builder.CreateBitCast(PartPtr,
1859	DataTy->getPointerTo(AddressSpace));
1860	StoreInst *NewSI =
1861	Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
1862	propagateMetadata(NewSI, SI);
1863	}
1864	return;
1865	}
1866
1867	// Handle loads.
1868	assert(LI && "Must have a load instruction")((LI && "Must have a load instruction") ? static_cast <void> (0) : __assert_fail ("LI && \"Must have a load instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1868, __PRETTY_FUNCTION__));
1869	setDebugLocFromInst(Builder, LI);
1870	for (unsigned Part = 0; Part < UF; ++Part) {
1871	// Calculate the pointer for the specific unroll-part.
1872	Value PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part VF));
1873
1874	if (Reverse) {
1875	// If the address is consecutive but reversed, then the
1876	// wide store needs to start at the last vector element.
1877	PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
1878	PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
1879	}
1880
1881	Value *VecPtr = Builder.CreateBitCast(PartPtr,
1882	DataTy->getPointerTo(AddressSpace));
1883	LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
1884	propagateMetadata(NewLI, LI);
1885	Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
1886	}
1887	}
1888
1889	void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
1890	assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors" ) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1890, __PRETTY_FUNCTION__));
1891	// Holds vector parameters or scalars, in case of uniform vals.
1892	SmallVector<VectorParts, 4> Params;
1893
1894	setDebugLocFromInst(Builder, Instr);
1895
1896	// Find all of the vectorized parameters.
1897	for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1898	Value *SrcOp = Instr->getOperand(op);
1899
1900	// If we are accessing the old induction variable, use the new one.
1901	if (SrcOp == OldInduction) {
1902	Params.push_back(getVectorValue(SrcOp));
1903	continue;
1904	}
1905
1906	// Try using previously calculated values.
1907	Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
1908
1909	// If the src is an instruction that appeared earlier in the basic block
1910	// then it should already be vectorized.
1911	if (SrcInst && OrigLoop->contains(SrcInst)) {
1912	assert(WidenMap.has(SrcInst) && "Source operand is unavailable")((WidenMap.has(SrcInst) && "Source operand is unavailable" ) ? static_cast<void> (0) : __assert_fail ("WidenMap.has(SrcInst) && \"Source operand is unavailable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1912, __PRETTY_FUNCTION__));
1913	// The parameter is a vector value from earlier.
1914	Params.push_back(WidenMap.get(SrcInst));
1915	} else {
1916	// The parameter is a scalar from outside the loop. Maybe even a constant.
1917	VectorParts Scalars;
1918	Scalars.append(UF, SrcOp);
1919	Params.push_back(Scalars);
1920	}
1921	}
1922
1923	assert(Params.size() == Instr->getNumOperands() &&((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1924, __PRETTY_FUNCTION__))
1924	"Invalid number of operands")((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1924, __PRETTY_FUNCTION__));
1925
1926	// Does this instruction return a value ?
1927	bool IsVoidRetTy = Instr->getType()->isVoidTy();
1928
1929	Value *UndefVec = IsVoidRetTy ? nullptr :
1930	UndefValue::get(VectorType::get(Instr->getType(), VF));
1931	// Create a new entry in the WidenMap and initialize it to Undef or Null.
1932	VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
1933
1934	Instruction *InsertPt = Builder.GetInsertPoint();
1935	BasicBlock *IfBlock = Builder.GetInsertBlock();
1936	BasicBlock *CondBlock = nullptr;
1937
1938	VectorParts Cond;
1939	Loop *VectorLp = nullptr;
1940	if (IfPredicateStore) {
1941	assert(Instr->getParent()->getSinglePredecessor() &&((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1942, __PRETTY_FUNCTION__))
1942	"Only support single predecessor blocks")((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1942, __PRETTY_FUNCTION__));
1943	Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
1944	Instr->getParent());
1945	VectorLp = LI->getLoopFor(IfBlock);
1946	assert(VectorLp && "Must have a loop for this block")((VectorLp && "Must have a loop for this block") ? static_cast <void> (0) : __assert_fail ("VectorLp && \"Must have a loop for this block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1946, __PRETTY_FUNCTION__));
1947	}
1948
1949	// For each vector unroll 'part':
1950	for (unsigned Part = 0; Part < UF; ++Part) {
1951	// For each scalar that we create:
1952	for (unsigned Width = 0; Width < VF; ++Width) {
1953
1954	// Start if-block.
1955	Value *Cmp = nullptr;
1956	if (IfPredicateStore) {
1957	Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
1958	Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
1959	CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
1960	LoopVectorBody.push_back(CondBlock);
1961	VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
1962	// Update Builder with newly created basic block.
1963	Builder.SetInsertPoint(InsertPt);
1964	}
1965
1966	Instruction *Cloned = Instr->clone();
1967	if (!IsVoidRetTy)
1968	Cloned->setName(Instr->getName() + ".cloned");
1969	// Replace the operands of the cloned instructions with extracted scalars.
1970	for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1971	Value *Op = Params[op][Part];
1972	// Param is a vector. Need to extract the right lane.
1973	if (Op->getType()->isVectorTy())
1974	Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
1975	Cloned->setOperand(op, Op);
1976	}
1977
1978	// Place the cloned scalar in the new loop.
1979	Builder.Insert(Cloned);
1980
1981	// If the original scalar returns a value we need to place it in a vector
1982	// so that future users will be able to use it.
1983	if (!IsVoidRetTy)
1984	VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
1985	Builder.getInt32(Width));
1986	// End if-block.
1987	if (IfPredicateStore) {
1988	BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
1989	LoopVectorBody.push_back(NewIfBlock);
1990	VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
1991	Builder.SetInsertPoint(InsertPt);
1992	Instruction *OldBr = IfBlock->getTerminator();
1993	BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1994	OldBr->eraseFromParent();
1995	IfBlock = NewIfBlock;
1996	}
1997	}
1998	}
1999	}
2000
2001	static Instruction getFirstInst(Instruction FirstInst, Value *V,
2002	Instruction *Loc) {
2003	if (FirstInst)
2004	return FirstInst;
2005	if (Instruction *I = dyn_cast<Instruction>(V))
2006	return I->getParent() == Loc->getParent() ? I : nullptr;
2007	return nullptr;
2008	}
2009
2010	std::pair<Instruction , Instruction >
2011	InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
2012	Instruction *tnullptr = nullptr;
2013	if (!Legal->mustCheckStrides())
2014	return std::pair<Instruction , Instruction >(tnullptr, tnullptr);
2015
2016	IRBuilder<> ChkBuilder(Loc);
2017
2018	// Emit checks.
2019	Value *Check = nullptr;
2020	Instruction *FirstInst = nullptr;
2021	for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
2022	SE = Legal->strides_end();
2023	SI != SE; ++SI) {
2024	Value Ptr = stripIntegerCast(SI);
2025	Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
2026	"stride.chk");
2027	// Store the first instruction we create.
2028	FirstInst = getFirstInst(FirstInst, C, Loc);
2029	if (Check)
2030	Check = ChkBuilder.CreateOr(Check, C);
2031	else
2032	Check = C;
2033	}
2034
2035	// We have to do this trickery because the IRBuilder might fold the check to a
2036	// constant expression in which case there is no Instruction anchored in a
2037	// the block.
2038	LLVMContext &Ctx = Loc->getContext();
2039	Instruction *TheCheck =
2040	BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
2041	ChkBuilder.Insert(TheCheck, "stride.not.one");
2042	FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
2043
2044	return std::make_pair(FirstInst, TheCheck);
2045	}
2046
2047	std::pair<Instruction , Instruction >
2048	InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
2049	LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
2050	Legal->getRuntimePointerCheck();
2051
2052	Instruction *tnullptr = nullptr;
2053	if (!PtrRtCheck->Need)
2054	return std::pair<Instruction , Instruction >(tnullptr, tnullptr);
2055
2056	unsigned NumPointers = PtrRtCheck->Pointers.size();
2057	SmallVector<TrackingVH<Value> , 2> Starts;
2058	SmallVector<TrackingVH<Value> , 2> Ends;
2059
2060	LLVMContext &Ctx = Loc->getContext();
2061	SCEVExpander Exp(*SE, "induction");
2062	Instruction *FirstInst = nullptr;
2063
2064	for (unsigned i = 0; i < NumPointers; ++i) {
2065	Value *Ptr = PtrRtCheck->Pointers[i];
2066	const SCEV *Sc = SE->getSCEV(Ptr);
2067
2068	if (SE->isLoopInvariant(Sc, OrigLoop)) {
2069	DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr <<"\n"; } } while (0)
2070	Ptr <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for a loop invariant ptr:" << Ptr <<"\n"; } } while (0);
2071	Starts.push_back(Ptr);
2072	Ends.push_back(Ptr);
2073	} else {
2074	DEBUG(dbgs() << "LV: Adding RT check for range:" << Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for range:" << Ptr << '\n'; } } while (0);
2075	unsigned AS = Ptr->getType()->getPointerAddressSpace();
2076
2077	// Use this type for pointer arithmetic.
2078	Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
2079
2080	Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
2081	Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
2082	Starts.push_back(Start);
2083	Ends.push_back(End);
2084	}
2085	}
2086
2087	IRBuilder<> ChkBuilder(Loc);
2088	// Our instructions might fold to a constant.
2089	Value *MemoryRuntimeCheck = nullptr;
2090	for (unsigned i = 0; i < NumPointers; ++i) {
2091	for (unsigned j = i+1; j < NumPointers; ++j) {
2092	// No need to check if two readonly pointers intersect.
2093	if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
2094	continue;
2095
2096	// Only need to check pointers between two different dependency sets.
2097	if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
2098	continue;
2099	// Only need to check pointers in the same alias set.
2100	if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
2101	continue;
2102
2103	unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
2104	unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
2105
2106	assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__))
2107	(AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__))
2108	"Trying to bounds check pointers with different address spaces")(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__));
2109
2110	Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
2111	Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
2112
2113	Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
2114	Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
2115	Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
2116	Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");
2117
2118	Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
2119	FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
2120	Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
2121	FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
2122	Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
2123	FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2124	if (MemoryRuntimeCheck) {
2125	IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
2126	"conflict.rdx");
2127	FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2128	}
2129	MemoryRuntimeCheck = IsConflict;
2130	}
2131	}
2132
2133	// We have to do this trickery because the IRBuilder might fold the check to a
2134	// constant expression in which case there is no Instruction anchored in a
2135	// the block.
2136	Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
2137	ConstantInt::getTrue(Ctx));
2138	ChkBuilder.Insert(Check, "memcheck.conflict");
2139	FirstInst = getFirstInst(FirstInst, Check, Loc);
2140	return std::make_pair(FirstInst, Check);
2141	}
2142
2143	void InnerLoopVectorizer::createEmptyLoop() {
2144	/*
2145	In this function we generate a new loop. The new loop will contain
2146	the vectorized instructions while the old loop will continue to run the
2147	scalar remainder.
2148
2149	[ ] <-- Back-edge taken count overflow check.
2150	/ \|
2151	/ v
2152	\| [ ] <-- vector loop bypass (may consist of multiple blocks).
2153	\| / \|
2154	\| / v
2155	\|\| [ ] <-- vector pre header.
2156	\|\| \|
2157	\|\| v
2158	\|\| [ ] \
2159	\|\| [ ]_\| <-- vector loop.
2160	\|\| \|
2161	\| \ v
2162	\| >[ ] <--- middle-block.
2163	\| / \|
2164	\| / v
2165	-\|- >[ ] <--- new preheader.
2166	\| \|
2167	\| v
2168	\| [ ] \
2169	\| [ ]_\| <-- old scalar loop to handle remainder.
2170	\ \|
2171	\ v
2172	>[ ] <-- exit block.
2173	...
2174	*/
2175
2176	BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2177	BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
2178	BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2179	assert(BypassBlock && "Invalid loop structure")((BypassBlock && "Invalid loop structure") ? static_cast <void> (0) : __assert_fail ("BypassBlock && \"Invalid loop structure\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2179, __PRETTY_FUNCTION__));
2180	assert(ExitBlock && "Must have an exit block")((ExitBlock && "Must have an exit block") ? static_cast <void> (0) : __assert_fail ("ExitBlock && \"Must have an exit block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2180, __PRETTY_FUNCTION__));
2181
2182	// Some loops have a single integer induction variable, while other loops
2183	// don't. One example is c++ iterators that often have multiple pointer
2184	// induction variables. In the code below we also support a case where we
2185	// don't have a single induction variable.
2186	OldInduction = Legal->getInduction();
2187	Type *IdxTy = Legal->getWidestInductionType();
2188
2189	// Find the loop boundaries.
2190	const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
2191	assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count")((ExitCount != SE->getCouldNotCompute() && "Invalid loop count" ) ? static_cast<void> (0) : __assert_fail ("ExitCount != SE->getCouldNotCompute() && \"Invalid loop count\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2191, __PRETTY_FUNCTION__));
2192
2193	// The exit count might have the type of i64 while the phi is i32. This can
2194	// happen if we have an induction variable that is sign extended before the
2195	// compare. The only way that we get a backedge taken count is that the
2196	// induction variable was signed and as such will not overflow. In such a case
2197	// truncation is legal.
2198	if (ExitCount->getType()->getPrimitiveSizeInBits() >
2199	IdxTy->getPrimitiveSizeInBits())
2200	ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
2201
2202	const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
2203	// Get the total trip count from the count by adding 1.
2204	ExitCount = SE->getAddExpr(BackedgeTakeCount,
2205	SE->getConstant(BackedgeTakeCount->getType(), 1));
2206
2207	// Expand the trip count and place the new instructions in the preheader.
2208	// Notice that the pre-header does not change, only the loop body.
2209	SCEVExpander Exp(*SE, "induction");
2210
2211	// We need to test whether the backedge-taken count is uint##_max. Adding one
2212	// to it will cause overflow and an incorrect loop trip count in the vector
2213	// body. In case of overflow we want to directly jump to the scalar remainder
2214	// loop.
2215	Value *BackedgeCount =
2216	Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
2217	BypassBlock->getTerminator());
2218	if (BackedgeCount->getType()->isPointerTy())
2219	BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
2220	"backedge.ptrcnt.to.int",
2221	BypassBlock->getTerminator());
2222	Instruction *CheckBCOverflow =
2223	CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
2224	Constant::getAllOnesValue(BackedgeCount->getType()),
2225	"backedge.overflow", BypassBlock->getTerminator());
2226
2227	// The loop index does not have to start at Zero. Find the original start
2228	// value from the induction PHI node. If we don't have an induction variable
2229	// then we know that it starts at zero.
2230	Builder.SetInsertPoint(BypassBlock->getTerminator());
2231	Value *StartIdx = ExtendedIdx = OldInduction ?
2232	Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
2233	IdxTy):
2234	ConstantInt::get(IdxTy, 0);
2235
2236	// We need an instruction to anchor the overflow check on. StartIdx needs to
2237	// be defined before the overflow check branch. Because the scalar preheader
2238	// is going to merge the start index and so the overflow branch block needs to
2239	// contain a definition of the start index.
2240	Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
2241	StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
2242	BypassBlock->getTerminator());
2243
2244	// Count holds the overall loop count (N).
2245	Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2246	BypassBlock->getTerminator());
2247
2248	LoopBypassBlocks.push_back(BypassBlock);
2249
2250	// Split the single block loop into the two loop structure described above.
2251	BasicBlock *VectorPH =
2252	BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
2253	BasicBlock *VecBody =
2254	VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2255	BasicBlock *MiddleBlock =
2256	VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2257	BasicBlock *ScalarPH =
2258	MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2259
2260	// Create and register the new vector loop.
2261	Loop* Lp = new Loop();
2262	Loop *ParentLoop = OrigLoop->getParentLoop();
2263
2264	// Insert the new loop into the loop nest and register the new basic blocks
2265	// before calling any utilities such as SCEV that require valid LoopInfo.
2266	if (ParentLoop) {
2267	ParentLoop->addChildLoop(Lp);
2268	ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
2269	ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
2270	ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
2271	} else {
2272	LI->addTopLevelLoop(Lp);
2273	}
2274	Lp->addBasicBlockToLoop(VecBody, LI->getBase());
2275
2276	// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
2277	// inside the loop.
2278	Builder.SetInsertPoint(VecBody->getFirstNonPHI());
2279
2280	// Generate the induction variable.
2281	setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
2282	Induction = Builder.CreatePHI(IdxTy, 2, "index");
2283	// The loop step is equal to the vectorization factor (num of SIMD elements)
2284	// times the unroll factor (num of SIMD instructions).
2285	Constant Step = ConstantInt::get(IdxTy, VF UF);
2286
2287	// This is the IR builder that we use to add all of the logic for bypassing
2288	// the new vector loop.
2289	IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
2290	setDebugLocFromInst(BypassBuilder,
2291	getDebugLocFromInstOrOperands(OldInduction));
2292
2293	// We may need to extend the index in case there is a type mismatch.
2294	// We know that the count starts at zero and does not overflow.
2295	if (Count->getType() != IdxTy) {
2296	// The exit count can be of pointer type. Convert it to the correct
2297	// integer type.
2298	if (ExitCount->getType()->isPointerTy())
2299	Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
2300	else
2301	Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
2302	}
2303
2304	// Add the start index to the loop count to get the new end index.
2305	Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
2306
2307	// Now we need to generate the expression for N - (N % VF), which is
2308	// the part that the vectorized body will execute.
2309	Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
2310	Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
2311	Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
2312	"end.idx.rnd.down");
2313
2314	// Now, compare the new count to zero. If it is zero skip the vector loop and
2315	// jump to the scalar loop.
2316	Value *Cmp =
2317	BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
2318
2319	BasicBlock *LastBypassBlock = BypassBlock;
2320
2321	// Generate code to check that the loops trip count that we computed by adding
2322	// one to the backedge-taken count will not overflow.
2323	{
2324	auto PastOverflowCheck =
2325	std::next(BasicBlock::iterator(OverflowCheckAnchor));
2326	BasicBlock *CheckBlock =
2327	LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
2328	if (ParentLoop)
2329	ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
2330	LoopBypassBlocks.push_back(CheckBlock);
2331	Instruction *OldTerm = LastBypassBlock->getTerminator();
2332	BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
2333	OldTerm->eraseFromParent();
2334	LastBypassBlock = CheckBlock;
2335	}
2336
2337	// Generate the code to check that the strides we assumed to be one are really
2338	// one. We want the new basic block to start at the first instruction in a
2339	// sequence of instructions that form a check.
2340	Instruction *StrideCheck;
2341	Instruction *FirstCheckInst;
2342	std::tie(FirstCheckInst, StrideCheck) =
2343	addStrideCheck(LastBypassBlock->getTerminator());
2344	if (StrideCheck) {
2345	// Create a new block containing the stride check.
2346	BasicBlock *CheckBlock =
2347	LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
2348	if (ParentLoop)
2349	ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
2350	LoopBypassBlocks.push_back(CheckBlock);
2351
2352	// Replace the branch into the memory check block with a conditional branch
2353	// for the "few elements case".
2354	Instruction *OldTerm = LastBypassBlock->getTerminator();
2355	BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
2356	OldTerm->eraseFromParent();
2357
2358	Cmp = StrideCheck;
2359	LastBypassBlock = CheckBlock;
2360	}
2361
2362	// Generate the code that checks in runtime if arrays overlap. We put the
2363	// checks into a separate block to make the more common case of few elements
2364	// faster.
2365	Instruction *MemRuntimeCheck;
2366	std::tie(FirstCheckInst, MemRuntimeCheck) =
2367	addRuntimeCheck(LastBypassBlock->getTerminator());
2368	if (MemRuntimeCheck) {
2369	// Create a new block containing the memory check.
2370	BasicBlock *CheckBlock =
2371	LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
2372	if (ParentLoop)
2373	ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
2374	LoopBypassBlocks.push_back(CheckBlock);
2375
2376	// Replace the branch into the memory check block with a conditional branch
2377	// for the "few elements case".
2378	Instruction *OldTerm = LastBypassBlock->getTerminator();
2379	BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
2380	OldTerm->eraseFromParent();
2381
2382	Cmp = MemRuntimeCheck;
2383	LastBypassBlock = CheckBlock;
2384	}
2385
2386	LastBypassBlock->getTerminator()->eraseFromParent();
2387	BranchInst::Create(MiddleBlock, VectorPH, Cmp,
2388	LastBypassBlock);
2389
2390	// We are going to resume the execution of the scalar loop.
2391	// Go over all of the induction variables that we found and fix the
2392	// PHIs that are left in the scalar version of the loop.
2393	// The starting values of PHI nodes depend on the counter of the last
2394	// iteration in the vectorized loop.
2395	// If we come from a bypass edge then we need to start from the original
2396	// start value.
2397
2398	// This variable saves the new starting index for the scalar loop.
2399	PHINode *ResumeIndex = nullptr;
2400	LoopVectorizationLegality::InductionList::iterator I, E;
2401	LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2402	// Set builder to point to last bypass block.
2403	BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
2404	for (I = List->begin(), E = List->end(); I != E; ++I) {
2405	PHINode *OrigPhi = I->first;
2406	LoopVectorizationLegality::InductionInfo II = I->second;
2407
2408	Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
2409	PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
2410	MiddleBlock->getTerminator());
2411	// We might have extended the type of the induction variable but we need a
2412	// truncated version for the scalar loop.
2413	PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
2414	PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
2415	MiddleBlock->getTerminator()) : nullptr;
2416
2417	// Create phi nodes to merge from the backedge-taken check block.
2418	PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
2419	ScalarPH->getTerminator());
2420	BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
2421
2422	PHINode *BCTruncResumeVal = nullptr;
2423	if (OrigPhi == OldInduction) {
2424	BCTruncResumeVal =
2425	PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
2426	ScalarPH->getTerminator());
2427	BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
2428	}
2429
2430	Value *EndValue = nullptr;
2431	switch (II.IK) {
2432	case LoopVectorizationLegality::IK_NoInduction:
2433	llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2433);
2434	case LoopVectorizationLegality::IK_IntInduction: {
2435	// Handle the integer induction counter.
2436	assert(OrigPhi->getType()->isIntegerTy() && "Invalid type")((OrigPhi->getType()->isIntegerTy() && "Invalid type" ) ? static_cast<void> (0) : __assert_fail ("OrigPhi->getType()->isIntegerTy() && \"Invalid type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2436, __PRETTY_FUNCTION__));
2437
2438	// We have the canonical induction variable.
2439	if (OrigPhi == OldInduction) {
2440	// Create a truncated version of the resume value for the scalar loop,
2441	// we might have promoted the type to a larger width.
2442	EndValue =
2443	BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
2444	// The new PHI merges the original incoming value, in case of a bypass,
2445	// or the value at the end of the vectorized loop.
2446	for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2447	TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
2448	TruncResumeVal->addIncoming(EndValue, VecBody);
2449
2450	BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
2451
2452	// We know what the end value is.
2453	EndValue = IdxEndRoundDown;
2454	// We also know which PHI node holds it.
2455	ResumeIndex = ResumeVal;
2456	break;
2457	}
2458
2459	// Not the canonical induction variable - add the vector loop count to the
2460	// start value.
2461	Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2462	II.StartValue->getType(),
2463	"cast.crd");
2464	EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
2465	break;
2466	}
2467	case LoopVectorizationLegality::IK_ReverseIntInduction: {
2468	// Convert the CountRoundDown variable to the PHI size.
2469	Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2470	II.StartValue->getType(),
2471	"cast.crd");
2472	// Handle reverse integer induction counter.
2473	EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
2474	break;
2475	}
2476	case LoopVectorizationLegality::IK_PtrInduction: {
2477	// For pointer induction variables, calculate the offset using
2478	// the end index.
2479	EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
2480	"ptr.ind.end");
2481	break;
2482	}
2483	case LoopVectorizationLegality::IK_ReversePtrInduction: {
2484	// The value at the end of the loop for the reverse pointer is calculated
2485	// by creating a GEP with a negative index starting from the start value.
2486	Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
2487	Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
2488	"rev.ind.end");
2489	EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
2490	"rev.ptr.ind.end");
2491	break;
2492	}
2493	}// end of case
2494
2495	// The new PHI merges the original incoming value, in case of a bypass,
2496	// or the value at the end of the vectorized loop.
2497	for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
2498	if (OrigPhi == OldInduction)
2499	ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
2500	else
2501	ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
2502	}
2503	ResumeVal->addIncoming(EndValue, VecBody);
2504
2505	// Fix the scalar body counter (PHI node).
2506	unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2507
2508	// The old induction's phi node in the scalar body needs the truncated
2509	// value.
2510	if (OrigPhi == OldInduction) {
2511	BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
2512	OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
2513	} else {
2514	BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
2515	OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2516	}
2517	}
2518
2519	// If we are generating a new induction variable then we also need to
2520	// generate the code that calculates the exit value. This value is not
2521	// simply the end of the counter because we may skip the vectorized body
2522	// in case of a runtime check.
2523	if (!OldInduction){
2524	assert(!ResumeIndex && "Unexpected resume value found")((!ResumeIndex && "Unexpected resume value found") ? static_cast <void> (0) : __assert_fail ("!ResumeIndex && \"Unexpected resume value found\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2524, __PRETTY_FUNCTION__));
2525	ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
2526	MiddleBlock->getTerminator());
2527	for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2528	ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
2529	ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
2530	}
2531
2532	// Make sure that we found the index where scalar loop needs to continue.
2533	assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&((ResumeIndex && ResumeIndex->getType()->isIntegerTy () && "Invalid resume Index") ? static_cast<void> (0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2534, __PRETTY_FUNCTION__))
2534	"Invalid resume Index")((ResumeIndex && ResumeIndex->getType()->isIntegerTy () && "Invalid resume Index") ? static_cast<void> (0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2534, __PRETTY_FUNCTION__));
2535
2536	// Add a check in the middle block to see if we have completed
2537	// all of the iterations in the first vector loop.
2538	// If (N - N%VF) == N, then we don't need to run the remainder.
2539	Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
2540	ResumeIndex, "cmp.n",
2541	MiddleBlock->getTerminator());
2542
2543	BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
2544	// Remove the old terminator.
2545	MiddleBlock->getTerminator()->eraseFromParent();
2546
2547	// Create i+1 and fill the PHINode.
2548	Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
2549	Induction->addIncoming(StartIdx, VectorPH);
2550	Induction->addIncoming(NextIdx, VecBody);
2551	// Create the compare.
2552	Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
2553	Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
2554
2555	// Now we have two terminators. Remove the old one from the block.
2556	VecBody->getTerminator()->eraseFromParent();
2557
2558	// Get ready to start creating new instructions into the vectorized body.
2559	Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
2560
2561	// Save the state.
2562	LoopVectorPreHeader = VectorPH;
2563	LoopScalarPreHeader = ScalarPH;
2564	LoopMiddleBlock = MiddleBlock;
2565	LoopExitBlock = ExitBlock;
2566	LoopVectorBody.push_back(VecBody);
2567	LoopScalarBody = OldBasicBlock;
2568
2569	LoopVectorizeHints Hints(Lp, true);
2570	Hints.setAlreadyVectorized();
2571	}
2572
2573	/// This function returns the identity element (or neutral element) for
2574	/// the operation K.
2575	Constant*
2576	LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
2577	switch (K) {
2578	case RK_IntegerXor:
2579	case RK_IntegerAdd:
2580	case RK_IntegerOr:
2581	// Adding, Xoring, Oring zero to a number does not change it.
2582	return ConstantInt::get(Tp, 0);
2583	case RK_IntegerMult:
2584	// Multiplying a number by 1 does not change it.
2585	return ConstantInt::get(Tp, 1);
2586	case RK_IntegerAnd:
2587	// AND-ing a number with an all-1 value does not change it.
2588	return ConstantInt::get(Tp, -1, true);
2589	case RK_FloatMult:
2590	// Multiplying a number by 1 does not change it.
2591	return ConstantFP::get(Tp, 1.0L);
2592	case RK_FloatAdd:
2593	// Adding zero to a number does not change it.
2594	return ConstantFP::get(Tp, 0.0L);
2595	default:
2596	llvm_unreachable("Unknown reduction kind")::llvm::llvm_unreachable_internal("Unknown reduction kind", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2596);
2597	}
2598	}
2599
2600	/// This function translates the reduction kind to an LLVM binary operator.
2601	static unsigned
2602	getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
2603	switch (Kind) {
2604	case LoopVectorizationLegality::RK_IntegerAdd:
2605	return Instruction::Add;
2606	case LoopVectorizationLegality::RK_IntegerMult:
2607	return Instruction::Mul;
2608	case LoopVectorizationLegality::RK_IntegerOr:
2609	return Instruction::Or;
2610	case LoopVectorizationLegality::RK_IntegerAnd:
2611	return Instruction::And;
2612	case LoopVectorizationLegality::RK_IntegerXor:
2613	return Instruction::Xor;
2614	case LoopVectorizationLegality::RK_FloatMult:
2615	return Instruction::FMul;
2616	case LoopVectorizationLegality::RK_FloatAdd:
2617	return Instruction::FAdd;
2618	case LoopVectorizationLegality::RK_IntegerMinMax:
2619	return Instruction::ICmp;
2620	case LoopVectorizationLegality::RK_FloatMinMax:
2621	return Instruction::FCmp;
2622	default:
2623	llvm_unreachable("Unknown reduction operation")::llvm::llvm_unreachable_internal("Unknown reduction operation" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2623);
2624	}
2625	}
2626
2627	Value *createMinMaxOp(IRBuilder<> &Builder,
2628	LoopVectorizationLegality::MinMaxReductionKind RK,
2629	Value *Left,
2630	Value *Right) {
	// Emits "select(cmp(P, Left, Right), Left, Right)" at Builder's current
	// insertion point and returns the select. The predicate P is chosen from
	// RK: unsigned/signed integer min and max use ICMP_ULT/UGT/SLT/SGT, the
	// floating point kinds use the ordered FCMP_OLT/OGT.
	//
	// \param Builder IR builder used to create the compare and the select.
	// \param RK      Which min/max flavour to emit; unknown values are
	//                unreachable.
	// \param Left    Value selected when the compare is true.
	// \param Right   Value selected when the compare is false.
	// \returns the created "rdx.minmax.select" value.

	// ICMP_NE is only a placeholder initializer; every reachable case below
	// overwrites it.
2631	CmpInst::Predicate P = CmpInst::ICMP_NE;
2632	switch (RK) {
2633	default:
2634	llvm_unreachable("Unknown min/max reduction kind")::llvm::llvm_unreachable_internal("Unknown min/max reduction kind" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2634);
2635	case LoopVectorizationLegality::MRK_UIntMin:
2636	P = CmpInst::ICMP_ULT;
2637	break;
2638	case LoopVectorizationLegality::MRK_UIntMax:
2639	P = CmpInst::ICMP_UGT;
2640	break;
2641	case LoopVectorizationLegality::MRK_SIntMin:
2642	P = CmpInst::ICMP_SLT;
2643	break;
2644	case LoopVectorizationLegality::MRK_SIntMax:
2645	P = CmpInst::ICMP_SGT;
2646	break;
2647	case LoopVectorizationLegality::MRK_FloatMin:
2648	P = CmpInst::FCMP_OLT;
2649	break;
2650	case LoopVectorizationLegality::MRK_FloatMax:
2651	P = CmpInst::FCMP_OGT;
2652	break;
2653	}
2654
	// Floating point kinds need an fcmp, everything else an icmp.
2655	Value *Cmp;
2656	if (RK == LoopVectorizationLegality::MRK_FloatMin ||
2657	RK == LoopVectorizationLegality::MRK_FloatMax)
2658	Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
2659	else
2660	Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
2661
2662	Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
2663	return Select;
2664	}
2665
2666	namespace {
	// DenseMapInfo-style traits used by cse() below to key a map on the
	// *structure* of an instruction (opcode + operands) rather than on its
	// address. Only the instruction kinds accepted by canHandle() may be hashed.
2667	struct CSEDenseMapInfo {
	// Returns true for the instruction kinds that cse() is allowed to merge:
	// insertelement, extractelement, shufflevector and getelementptr.
2668	static bool canHandle(Instruction *I) {
2669	return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2670	isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2671	}
	// The empty and tombstone sentinels are borrowed from the pointer traits.
2672	static inline Instruction *getEmptyKey() {
2673	return DenseMapInfo<Instruction *>::getEmptyKey();
2674	}
2675	static inline Instruction *getTombstoneKey() {
2676	return DenseMapInfo<Instruction *>::getTombstoneKey();
2677	}
	// Hashes the opcode together with all value operands. Must not be called
	// on a sentinel key or on an instruction canHandle() rejects.
2678	static unsigned getHashValue(Instruction *I) {
2679	assert(canHandle(I) && "Unknown instruction!")((canHandle(I) && "Unknown instruction!") ? static_cast <void> (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2679, __PRETTY_FUNCTION__));
2680	return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2681	I->value_op_end()));
2682	}
	// Sentinel keys are compared by address (they must never be dereferenced);
	// real instructions are compared with isIdenticalTo().
2683	static bool isEqual(Instruction *LHS, Instruction *RHS) {
2684	if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2685	LHS == getTombstoneKey() || RHS == getTombstoneKey())
2686	return LHS == RHS;
2687	return LHS->isIdenticalTo(RHS);
2688	}
2689	};
2690	}
2691
2692	/// \brief Check whether this block is a predicated block.
2693	/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
2694	/// = ...; " blocks. We start with one vectorized basic block. For every
2695	/// conditional block we split this vectorized block. Therefore, every second
2696	/// block will be a predicated one.
	/// \param BlockNum Zero-based position of the block in the vector body list.
	/// \returns true exactly when BlockNum is odd.
2697	static bool isPredicatedBlock(unsigned BlockNum) {
2698	return BlockNum % 2;
2699	}
2700
2701	///\brief Perform cse of induction variable instructions.
	/// Walks the given blocks in order and replaces every instruction accepted
	/// by CSEDenseMapInfo::canHandle() with an earlier structurally identical
	/// one, erasing the duplicate.
	/// \param BBs Blocks to scan; odd-indexed entries are treated as predicated
	///            blocks (see isPredicatedBlock).
2702	static void cse(SmallVector<BasicBlock *, 4> &BBs) {
2703	// Perform simple cse.
	// The map is keyed on instruction structure (CSEDenseMapInfo) and maps each
	// representative instruction to itself.
2704	SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2705	for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
2706	BasicBlock *BB = BBs[i];
2707	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
	// Advance before looking at the instruction: it may be erased below.
2708	Instruction *In = I++;
2709
2710	if (!CSEDenseMapInfo::canHandle(In))
2711	continue;
2712
2713	// Check if we can replace this instruction with any of the
2714	// visited instructions.
2715	if (Instruction *V = CSEMap.lookup(In)) {
2716	In->replaceAllUsesWith(V);
2717	In->eraseFromParent();
2718	continue;
2719	}
2720	// Ignore instructions in conditional blocks. We create "if (pred) a[i] =
2721	// ...;" blocks for predicated stores. Every second block is a predicated
2722	// block.
	// Such instructions may still be replaced above, but they are never
	// recorded as a replacement candidate for later instructions.
2723	if (isPredicatedBlock(i))
2724	continue;
2725
2726	CSEMap[In] = In;
2727	}
2728	}
2729	}
2730
2731	/// \brief Adds a 'fast' flag to floating point operations.
	/// If \p V is a floating point math operator, its fast-math flags are
	/// replaced by a flag set with only "unsafe algebra" requested; any other
	/// value is left untouched.
	/// \returns \p V itself, so the call can wrap a builder expression.
2732	static Value *addFastMathFlag(Value *V) {
2733	if (isa<FPMathOperator>(V)){
2734	FastMathFlags Flags;
2735	Flags.setUnsafeAlgebra();
2736	cast<Instruction>(V)->setFastMathFlags(Flags);
2737	}
2738	return V;
2739	}
2740
2741	void InnerLoopVectorizer::vectorizeLoop() {
	// Widens every block of OrigLoop (in reverse post-order) and then completes
	// the reduction PHIs that were created empty during widening. For each
	// reduction this builds the start vector in the vector preheader, wires the
	// vector PHIs, reduces the unrolled parts and the vector lanes to a scalar
	// in the middle block, and patches the scalar loop and the LCSSA exit PHIs
	// to use that scalar. Finishes with fixLCSSAPHIs() and cse() on the vector
	// body.
2742	//===------------------------------------------------===//
2743	//
2744	// Notice: any optimization or new instruction that go
2745	// into the code below should also be implemented in
2746	// the cost-model.
2747	//
2748	//===------------------------------------------------===//
2749	Constant *Zero = Builder.getInt32(0);
2750
2751	// In order to support reduction variables we need to be able to vectorize
2752	// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
2753	// stages. First, we create a new vector PHI node with no incoming edges.
2754	// We use this value when we vectorize all of the instructions that use the
2755	// PHI. Next, after all of the instructions in the block are complete we
2756	// add the new incoming edges to the PHI. At this point all of the
2757	// instructions in the basic block are vectorized, so we can use them to
2758	// construct the PHI.
2759	PhiVector RdxPHIsToFix;
2760
2761	// Scan the loop in a topological order to ensure that defs are vectorized
2762	// before users.
2763	LoopBlocksDFS DFS(OrigLoop);
2764	DFS.perform(LI);
2765
2766	// Vectorize all of the blocks in the original loop.
2767	for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
2768	be = DFS.endRPO(); bb != be; ++bb)
2769	vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
2770
2771	// At this point every instruction in the original loop is widened to
2772	// a vector form. We are almost done. Now, we need to fix the PHI nodes
2773	// that we vectorized. The PHI nodes are currently empty because we did
2774	// not want to introduce cycles. Notice that the remaining PHI nodes
2775	// that we need to fix are reduction variables.
2776
2777	// Create the 'reduced' values for each of the reduction vars.
2778	// The reduced values are the vector values that we scalarize and combine
2779	// after the loop is finished.
2780	for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
2781	it != e; ++it) {
	// RdxPhi is the original (scalar) reduction PHI recorded during widening.
2782	PHINode *RdxPhi = *it;
2783	assert(RdxPhi && "Unable to recover vectorized PHI")((RdxPhi && "Unable to recover vectorized PHI") ? static_cast <void> (0) : __assert_fail ("RdxPhi && \"Unable to recover vectorized PHI\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2783, __PRETTY_FUNCTION__));
2784
2785	// Find the reduction variable descriptor.
2786	assert(Legal->getReductionVars()->count(RdxPhi) &&((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2787, __PRETTY_FUNCTION__))
2787	"Unable to find the reduction variable")((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2787, __PRETTY_FUNCTION__));
2788	LoopVectorizationLegality::ReductionDescriptor RdxDesc =
2789	(*Legal->getReductionVars())[RdxPhi];
2790
2791	setDebugLocFromInst(Builder, RdxDesc.StartValue);
2792
2793	// We need to generate a reduction vector from the incoming scalar.
2794	// To do so, we need to generate the 'identity' vector and override
2795	// one of the elements with the incoming scalar reduction. We need
2796	// to do it in the vector-loop preheader.
2797	Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
2798
2799	// This is the vector-clone of the value that leaves the loop.
2800	VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
2801	Type *VecTy = VectorExit[0]->getType();
2802
2803	// Find the reduction identity variable. Zero for addition, or, xor,
2804	// one for multiplication, -1 for And.
2805	Value *Identity;
2806	Value *VectorStart;
2807	if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
2808	RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
2809	// MinMax reductions have the start value as their identity.
2810	if (VF == 1) {
2811	VectorStart = Identity = RdxDesc.StartValue;
2812	} else {
2813	VectorStart = Identity = Builder.CreateVectorSplat(VF,
2814	RdxDesc.StartValue,
2815	"minmax.ident");
2816	}
2817	} else {
2818	// Handle other reduction kinds:
2819	Constant *Iden =
2820	LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
2821	VecTy->getScalarType());
2822	if (VF == 1) {
2823	Identity = Iden;
2824	// This vector is the Identity vector where the first element is the
2825	// incoming scalar reduction.
2826	VectorStart = RdxDesc.StartValue;
2827	} else {
2828	Identity = ConstantVector::getSplat(VF, Iden);
2829
2830	// This vector is the Identity vector where the first element is the
2831	// incoming scalar reduction.
2832	VectorStart = Builder.CreateInsertElement(Identity,
2833	RdxDesc.StartValue, Zero);
2834	}
2835	}
2836
2837	// Fix the vector-loop phi.
2838	// We created the induction variable so we know that the
2839	// preheader is the first entry.
2840	BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
2841
2842	// Reductions do not have to start at zero. They can start with
2843	// any loop invariant values.
2844	VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
2845	BasicBlock *Latch = OrigLoop->getLoopLatch();
2846	Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
2847	VectorParts &Val = getVectorValue(LoopVal);
2848	for (unsigned part = 0; part < UF; ++part) {
2849	// Make sure to add the reduction start value only to the
2850	// first unroll part.
2851	Value *StartVal = (part == 0) ? VectorStart : Identity;
2852	cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
2853	cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
2854	LoopVectorBody.back());
2855	}
2856
2857	// Before each round, move the insertion point right between
2858	// the PHIs and the values we are going to write.
2859	// This allows us to write both PHINodes and the extractelement
2860	// instructions.
2861	Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
2862
2863	VectorParts RdxParts;
2864	setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
2865	for (unsigned part = 0; part < UF; ++part) {
2866	// This PHINode contains the vectorized reduction variable, or
2867	// the initial value vector, if we bypass the vector loop.
2868	VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
2869	PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
2870	Value *StartVal = (part == 0) ? VectorStart : Identity;
	// Bypass block 0 is skipped here; it is merged separately through
	// the "bc.merge.rdx" PHI created further down.
2871	for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2872	NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
2873	NewPhi->addIncoming(RdxExitVal[part],
2874	LoopVectorBody.back());
2875	RdxParts.push_back(NewPhi);
2876	}
2877
2878	// Reduce all of the unrolled parts into a single vector.
2879	Value *ReducedPartRdx = RdxParts[0];
2880	unsigned Op = getReductionBinOp(RdxDesc.Kind);
2881	setDebugLocFromInst(Builder, ReducedPartRdx);
2882	for (unsigned part = 1; part < UF; ++part) {
	// ICmp/FCmp mark a min/max reduction, which needs compare+select.
2883	if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2884	// Floating point operations had to be 'fast' to enable the reduction.
2885	ReducedPartRdx = addFastMathFlag(
2886	Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
2887	ReducedPartRdx, "bin.rdx"));
2888	else
2889	ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
2890	ReducedPartRdx, RdxParts[part]);
2891	}
2892
2893	if (VF > 1) {
2894	// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
2895	// and vector ops, reducing the set of values being computed by half each
2896	// round.
2897	assert(isPowerOf2_32(VF) &&((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2898, __PRETTY_FUNCTION__))
2898	"Reduction emission only supported for pow2 vectors!")((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2898, __PRETTY_FUNCTION__));
2899	Value *TmpVec = ReducedPartRdx;
2900	SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
2901	for (unsigned i = VF; i != 1; i >>= 1) {
2902	// Move the upper half of the vector to the lower half.
2903	for (unsigned j = 0; j != i/2; ++j)
2904	ShuffleMask[j] = Builder.getInt32(i/2 + j);
2905
2906	// Fill the rest of the mask with undef.
2907	std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
2908	UndefValue::get(Builder.getInt32Ty()));
2909
2910	Value *Shuf =
2911	Builder.CreateShuffleVector(TmpVec,
2912	UndefValue::get(TmpVec->getType()),
2913	ConstantVector::get(ShuffleMask),
2914	"rdx.shuf");
2915
2916	if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2917	// Floating point operations had to be 'fast' to enable the reduction.
2918	TmpVec = addFastMathFlag(Builder.CreateBinOp(
2919	(Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
2920	else
2921	TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
2922	}
2923
2924	// The result is in the first element of the vector.
2925	ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
2926	Builder.getInt32(0));
2927	}
2928
2929	// Create a phi node that merges control-flow from the backedge-taken check
2930	// block and the middle block.
2931	PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
2932	LoopScalarPreHeader->getTerminator());
2933	BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
2934	BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
2935
2936	// Now, we need to fix the users of the reduction variable
2937	// inside and outside of the scalar remainder loop.
2938	// We know that the loop is in LCSSA form. We need to update the
2939	// PHI nodes in the exit blocks.
2940	for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2941	LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2942	PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
	// PHIs are grouped at the top of the block; stop at the first non-PHI.
2943	if (!LCSSAPhi) break;
2944
2945	// All PHINodes need to have a single entry edge, or two if
2946	// we already fixed them.
2947	assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI")((LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI" ) ? static_cast<void> (0) : __assert_fail ("LCSSAPhi->getNumIncomingValues() < 3 && \"Invalid LCSSA PHI\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2947, __PRETTY_FUNCTION__));
2948
2949	// We found our reduction value exit-PHI. Update it with the
2950	// incoming bypass edge.
2951	if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
2952	// Add an edge coming from the bypass.
2953	LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
2954	break;
2955	}
2956	}// end of the LCSSA phi scan.
2957
2958	// Fix the scalar loop reduction variable with the incoming reduction sum
2959	// from the vector body and from the backedge value.
2960	int IncomingEdgeBlockIdx =
2961	(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
2962	assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")((IncomingEdgeBlockIdx >= 0 && "Invalid block index" ) ? static_cast<void> (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2962, __PRETTY_FUNCTION__));
2963	// Pick the other block.
2964	int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
2965	(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
2966	(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
2967	}// end of for each redux variable.
2968
2969	fixLCSSAPHIs();
2970
2971	// Remove redundant induction instructions.
2972	cse(LoopVectorBody);
2973	}
2974
2975	void InnerLoopVectorizer::fixLCSSAPHIs() {
	// Gives every leading PHI of LoopExitBlock that still has exactly one
	// incoming value a second incoming entry for LoopMiddleBlock, using an
	// undef of the PHI's type as the value. PHIs that already have more than
	// one incoming value (e.g. the reduction exit PHIs patched in
	// vectorizeLoop) are left alone.
2976	for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2977	LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2978	PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
	// Stop at the first instruction that is not a PHI.
2979	if (!LCSSAPhi) break;
2980	if (LCSSAPhi->getNumIncomingValues() == 1)
2981	LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
2982	LoopMiddleBlock);
2983	}
2984	}
2985
2986	InnerLoopVectorizer::VectorParts
2987	InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
	// Returns the per-unroll-part mask for the control-flow edge Src -> Dst:
	// the block-in mask of Src, additionally AND-ed with the (possibly
	// negated) widened branch condition when Src ends in a conditional branch.
	// Results are memoized in MaskCache, keyed by the (Src, Dst) pair.
	//
	// \param Src Source block of the edge; must be a predecessor of \p Dst and
	//            must end in a branch instruction.
	// \param Dst Destination block of the edge.
2988	assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&((std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end( Dst) && "Invalid edge") ? static_cast<void> (0) : __assert_fail ("std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && \"Invalid edge\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2989, __PRETTY_FUNCTION__))
2989	"Invalid edge")((std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end( Dst) && "Invalid edge") ? static_cast<void> (0) : __assert_fail ("std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && \"Invalid edge\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2989, __PRETTY_FUNCTION__));
2990
2991	// Look for cached value.
2992	std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
2993	EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
2994	if (ECEntryIt != MaskCache.end())
2995	return ECEntryIt->second;
2996
2997	VectorParts SrcMask = createBlockInMask(Src);
2998
2999	// The terminator has to be a branch inst!
3000	BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
3001	assert(BI && "Unexpected terminator found")((BI && "Unexpected terminator found") ? static_cast< void> (0) : __assert_fail ("BI && \"Unexpected terminator found\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3001, __PRETTY_FUNCTION__));
3002
3003	if (BI->isConditional()) {
	// Taken by value on purpose: the parts are overwritten below.
3004	VectorParts EdgeMask = getVectorValue(BI->getCondition());
3005
	// Dst is the false successor: the edge is taken when the condition is
	// false, so invert it.
3006	if (BI->getSuccessor(0) != Dst)
3007	for (unsigned part = 0; part < UF; ++part)
3008	EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
3009
3010	for (unsigned part = 0; part < UF; ++part)
3011	EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
3012
3013	MaskCache[Edge] = EdgeMask;
3014	return EdgeMask;
3015	}
3016
	// Unconditional branch: the edge is taken whenever Src is executed.
3017	MaskCache[Edge] = SrcMask;
3018	return SrcMask;
3019	}
3020
3021	InnerLoopVectorizer::VectorParts
3022	InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
	// Returns the per-unroll-part mask under which block BB executes.
	// For the loop header this is the widened i1 constant 1; for any other
	// block it is the OR of the edge masks of all predecessors, computed via
	// createEdgeMask() (which in turn calls back into this function for the
	// predecessor block). BB must belong to OrigLoop.
3023	assert(OrigLoop->contains(BB) && "Block is not a part of a loop")((OrigLoop->contains(BB) && "Block is not a part of a loop" ) ? static_cast<void> (0) : __assert_fail ("OrigLoop->contains(BB) && \"Block is not a part of a loop\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3023, __PRETTY_FUNCTION__));
3024
3025	// Loop incoming mask is all-one.
3026	if (OrigLoop->getHeader() == BB) {
3027	Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
3028	return getVectorValue(C);
3029	}
3030
3031	// This is the block mask. We OR all incoming edges, starting from zero.
3032	Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
	// Declared by value, so the parts accumulated below are a local copy.
3033	VectorParts BlockMask = getVectorValue(Zero);
3034
3035	// For each pred:
3036	for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
3037	VectorParts EM = createEdgeMask(*it, BB);
3038	for (unsigned part = 0; part < UF; ++part)
3039	BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
3040	}
3041
3042	return BlockMask;
3043	}
3044
3045	void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
3046	InnerLoopVectorizer::VectorParts &Entry,
3047	unsigned UF, unsigned VF, PhiVector *PV) {
	// Produces the widened value(s) for the PHI node PN and stores one value
	// per unroll part into Entry[0 .. UF-1]. Three kinds of PHI are handled:
	//   1. reduction PHIs: empty "vec.phi" nodes are created and the scalar
	//      PHI is appended to *PV so its incoming edges can be added later;
	//   2. PHIs outside the loop header: lowered to a chain of selects driven
	//      by the edge masks of the incoming blocks;
	//   3. everything else must be a known induction variable and is expanded
	//      according to its induction kind.
	//
	// \param PN    The PHI to widen; cast<PHINode> is applied unconditionally.
	// \param Entry Output slots, indexed by unroll part.
	// \param UF    Number of unroll parts to fill.
	// \param VF    Vector width; with VF == 1 scalar types/values are used.
	// \param PV    List receiving reduction PHIs; dereferenced without a null
	//              check on the reduction path.
3048	PHINode* P = cast<PHINode>(PN);
3049	// Handle reduction variables:
3050	if (Legal->getReductionVars()->count(P)) {
3051	for (unsigned part = 0; part < UF; ++part) {
3052	// This is phase one of vectorizing PHIs.
3053	Type *VecTy = (VF == 1) ? PN->getType() :
3054	VectorType::get(PN->getType(), VF);
3055	Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
3056	LoopVectorBody.back()-> getFirstInsertionPt());
3057	}
3058	PV->push_back(P);
3059	return;
3060	}
3061
3062	setDebugLocFromInst(Builder, P);
3063	// Check for PHI nodes that are lowered to vector selects.
3064	if (P->getParent() != OrigLoop->getHeader()) {
3065	// We know that all PHIs in non-header blocks are converted into
3066	// selects, so we don't have to worry about the insertion order and we
3067	// can just use the builder.
3068	// At this point we generate the predication tree. There may be
3069	// duplications since this is a simple recursive scan, but future
3070	// optimizations will clean it up.
3071
3072	unsigned NumIncoming = P->getNumIncomingValues();
3073
3074	// Generate a sequence of selects of the form:
3075	// SELECT(Mask3, In3,
3076	// SELECT(Mask2, In2,
3077	// ( ...)))
3078	for (unsigned In = 0; In < NumIncoming; In++) {
3079	VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
3080	P->getParent());
3081	VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
3082
3083	for (unsigned part = 0; part < UF; ++part) {
3084	// We might have single edge PHIs (blocks) - use an identity
3085	// 'select' for the first PHI operand.
3086	if (In == 0)
3087	Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3088	In0[part]);
3089	else
3090	// Select between the current value and the previous incoming edge
3091	// based on the incoming mask.
3092	Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3093	Entry[part], "predphi");
3094	}
3095	}
3096	return;
3097	}
3098
3099	// This PHINode must be an induction variable.
3100	// Make sure that we know about it.
3101	assert(Legal->getInductionVars()->count(P) &&((Legal->getInductionVars()->count(P) && "Not an induction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3102, __PRETTY_FUNCTION__))
3102	"Not an induction variable")((Legal->getInductionVars()->count(P) && "Not an induction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3102, __PRETTY_FUNCTION__));
3103
3104	LoopVectorizationLegality::InductionInfo II =
3105	Legal->getInductionVars()->lookup(P);
3106
3107	switch (II.IK) {
3108	case LoopVectorizationLegality::IK_NoInduction:
3109	llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3109);
3110	case LoopVectorizationLegality::IK_IntInduction: {
3111	assert(P->getType() == II.StartValue->getType() && "Types must match")((P->getType() == II.StartValue->getType() && "Types must match" ) ? static_cast<void> (0) : __assert_fail ("P->getType() == II.StartValue->getType() && \"Types must match\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3111, __PRETTY_FUNCTION__));
3112	Type *PhiTy = P->getType();
3113	Value *Broadcasted;
3114	if (P == OldInduction) {
3115	// Handle the canonical induction variable. We might have had to
3116	// extend the type.
3117	Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
3118	} else {
3119	// Handle other induction variables that are now based on the
3120	// canonical one.
3121	Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
3122	"normalized.idx");
3123	NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
3124	Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
3125	"offset.idx");
3126	}
3127	Broadcasted = getBroadcastInstrs(Broadcasted);
3128	// After broadcasting the induction variable we need to make the vector
3129	// consecutive by adding 0, 1, 2, etc.
3130	for (unsigned part = 0; part < UF; ++part)
3131	Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
3132	return;
3133	}
	// The three remaining kinds share the normalized index computed below;
	// the reverse integer case returns early, the pointer cases fall
	// through to the scalar GEP expansion.
3134	case LoopVectorizationLegality::IK_ReverseIntInduction:
3135	case LoopVectorizationLegality::IK_PtrInduction:
3136	case LoopVectorizationLegality::IK_ReversePtrInduction:
3137	// Handle reverse integer and pointer inductions.
3138	Value *StartIdx = ExtendedIdx;
3139	// This is the normalized GEP that starts counting at zero.
3140	Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
3141	"normalized.idx");
3142
3143	// Handle the reverse integer induction variable case.
3144	if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
3145	IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
3146	Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
3147	"resize.norm.idx");
3148	Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
3149	"reverse.idx");
3150
3151	// This is a new value so do not hoist it out.
3152	Value *Broadcasted = getBroadcastInstrs(ReverseInd);
3153	// After broadcasting the induction variable we need to make the
3154	// vector consecutive by adding ... -3, -2, -1, 0.
3155	for (unsigned part = 0; part < UF; ++part)
3156	Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
3157	true);
3158	return;
3159	}
3160
3161	// Handle the pointer induction variable case.
3162	assert(P->getType()->isPointerTy() && "Unexpected type.")((P->getType()->isPointerTy() && "Unexpected type." ) ? static_cast<void> (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3162, __PRETTY_FUNCTION__));
3163
3164	// Is this a reverse induction ptr or a consecutive induction ptr.
3165	bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
3166	II.IK);
3167
3168	// This is the vector of results. Notice that we don't generate
3169	// vector geps because scalar geps result in better code.
3170	for (unsigned part = 0; part < UF; ++part) {
	// Scalar (VF == 1) case: one GEP per unroll part, offset by +/-part.
3171	if (VF == 1) {
3172	int EltIndex = (part) * (Reverse ? -1 : 1);
3173	Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
3174	Value *GlobalIdx;
3175	if (Reverse)
3176	GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
3177	else
3178	GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
3179
3180	Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
3181	"next.gep");
3182	Entry[part] = SclrGep;
3183	continue;
3184	}
3185
	// Vector case: build the vector lane by lane from scalar GEPs, lane i
	// of this part using element offset +/-(i + part * VF).
3186	Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
3187	for (unsigned int i = 0; i < VF; ++i) {
3188	int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
3189	Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
3190	Value *GlobalIdx;
3191	if (!Reverse)
3192	GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
3193	else
3194	GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
3195
3196	Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
3197	"next.gep");
3198	VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
3199	Builder.getInt32(i),
3200	"insert.gep");
3201	}
3202	Entry[part] = VecVal;
3203	}
3204	return;
3205	}
3206	}
3207
3208	void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock BB, PhiVector PV) {
3209	// For each instruction in the old loop.
3210	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3211	VectorParts &Entry = WidenMap.get(it);
3212	switch (it->getOpcode()) {
3213	case Instruction::Br:
3214	// Nothing to do for PHIs and BR, since we already took care of the
3215	// loop control flow instructions.
3216	continue;
3217	case Instruction::PHI:{
3218	// Vectorize PHINodes.
3219	widenPHIInstruction(it, Entry, UF, VF, PV);
3220	continue;
3221	}// End of PHI.
3222
3223	case Instruction::Add:
3224	case Instruction::FAdd:
3225	case Instruction::Sub:
3226	case Instruction::FSub:
3227	case Instruction::Mul:
3228	case Instruction::FMul:
3229	case Instruction::UDiv:
3230	case Instruction::SDiv:
3231	case Instruction::FDiv:
3232	case Instruction::URem:
3233	case Instruction::SRem:
3234	case Instruction::FRem:
3235	case Instruction::Shl:
3236	case Instruction::LShr:
3237	case Instruction::AShr:
3238	case Instruction::And:
3239	case Instruction::Or:
3240	case Instruction::Xor: {
3241	// Just widen binops.
3242	BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
3243	setDebugLocFromInst(Builder, BinOp);
3244	VectorParts &A = getVectorValue(it->getOperand(0));
3245	VectorParts &B = getVectorValue(it->getOperand(1));
3246
3247	// Use this vector value for all users of the original instruction.
3248	for (unsigned Part = 0; Part < UF; ++Part) {
3249	Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
3250
3251	if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3252	VecOp->copyIRFlags(BinOp);
3253
3254	Entry[Part] = V;
3255	}
3256
3257	propagateMetadata(Entry, it);
3258	break;
3259	}
3260	case Instruction::Select: {
3261	// Widen selects.
3262	// If the selector is loop invariant we can create a select
3263	// instruction with a scalar condition. Otherwise, use vector-select.
3264	bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
3265	OrigLoop);
3266	setDebugLocFromInst(Builder, it);
3267
3268	// The condition can be loop invariant but still defined inside the
3269	// loop. This means that we can't just use the original 'cond' value.
3270	// We have to take the 'vectorized' value and pick the first lane.
3271	// Instcombine will make this a no-op.
3272	VectorParts &Cond = getVectorValue(it->getOperand(0));
3273	VectorParts &Op0 = getVectorValue(it->getOperand(1));
3274	VectorParts &Op1 = getVectorValue(it->getOperand(2));
3275
3276	Value *ScalarCond = (VF == 1) ? Cond[0] :
3277	Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
3278
3279	for (unsigned Part = 0; Part < UF; ++Part) {
3280	Entry[Part] = Builder.CreateSelect(
3281	InvariantCond ? ScalarCond : Cond[Part],
3282	Op0[Part],
3283	Op1[Part]);
3284	}
3285
3286	propagateMetadata(Entry, it);
3287	break;
3288	}
3289
3290	case Instruction::ICmp:
3291	case Instruction::FCmp: {
3292	// Widen compares. Generate vector compares.
3293	bool FCmp = (it->getOpcode() == Instruction::FCmp);
3294	CmpInst *Cmp = dyn_cast<CmpInst>(it);
3295	setDebugLocFromInst(Builder, it);
3296	VectorParts &A = getVectorValue(it->getOperand(0));
3297	VectorParts &B = getVectorValue(it->getOperand(1));
3298	for (unsigned Part = 0; Part < UF; ++Part) {
3299	Value *C = nullptr;
3300	if (FCmp)
3301	C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
3302	else
3303	C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
3304	Entry[Part] = C;
3305	}
3306
3307	propagateMetadata(Entry, it);
3308	break;
3309	}
3310
3311	case Instruction::Store:
3312	case Instruction::Load:
3313	vectorizeMemoryInstruction(it);
3314	break;
3315	case Instruction::ZExt:
3316	case Instruction::SExt:
3317	case Instruction::FPToUI:
3318	case Instruction::FPToSI:
3319	case Instruction::FPExt:
3320	case Instruction::PtrToInt:
3321	case Instruction::IntToPtr:
3322	case Instruction::SIToFP:
3323	case Instruction::UIToFP:
3324	case Instruction::Trunc:
3325	case Instruction::FPTrunc:
3326	case Instruction::BitCast: {
3327	CastInst *CI = dyn_cast<CastInst>(it);
3328	setDebugLocFromInst(Builder, it);
3329	/// Optimize the special case where the source is the induction
3330	/// variable. Notice that we can only optimize the 'trunc' case
3331	/// because: a. FP conversions lose precision, b. sext/zext may wrap,
3332	/// c. other casts depend on pointer size.
3333	if (CI->getOperand(0) == OldInduction &&
3334	it->getOpcode() == Instruction::Trunc) {
3335	Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
3336	CI->getType());
3337	Value *Broadcasted = getBroadcastInstrs(ScalarCast);
3338	for (unsigned Part = 0; Part < UF; ++Part)
3339	Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
3340	propagateMetadata(Entry, it);
3341	break;
3342	}
3343	/// Vectorize casts.
3344	Type *DestTy = (VF == 1) ? CI->getType() :
3345	VectorType::get(CI->getType(), VF);
3346
3347	VectorParts &A = getVectorValue(it->getOperand(0));
3348	for (unsigned Part = 0; Part < UF; ++Part)
3349	Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
3350	propagateMetadata(Entry, it);
3351	break;
3352	}
3353
3354	case Instruction::Call: {
3355	// Ignore dbg intrinsics.
3356	if (isa<DbgInfoIntrinsic>(it))
3357	break;
3358	setDebugLocFromInst(Builder, it);
3359
3360	Module *M = BB->getParent()->getParent();
3361	CallInst *CI = cast<CallInst>(it);
3362	Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
3363	assert(ID && "Not an intrinsic call!")((ID && "Not an intrinsic call!") ? static_cast<void > (0) : __assert_fail ("ID && \"Not an intrinsic call!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3363, __PRETTY_FUNCTION__));
3364	switch (ID) {
3365	case Intrinsic::assume:
3366	case Intrinsic::lifetime_end:
3367	case Intrinsic::lifetime_start:
3368	scalarizeInstruction(it);
3369	break;
3370	default:
3371	bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
3372	for (unsigned Part = 0; Part < UF; ++Part) {
3373	SmallVector<Value *, 4> Args;
3374	for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
3375	if (HasScalarOpd && i == 1) {
3376	Args.push_back(CI->getArgOperand(i));
3377	continue;
3378	}
3379	VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
3380	Args.push_back(Arg[Part]);
3381	}
3382	Type *Tys[] = {CI->getType()};
3383	if (VF > 1)
3384	Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
3385
3386	Function *F = Intrinsic::getDeclaration(M, ID, Tys);
3387	Entry[Part] = Builder.CreateCall(F, Args);
3388	}
3389
3390	propagateMetadata(Entry, it);
3391	break;
3392	}
3393	break;
3394	}
3395
3396	default:
3397	// All other instructions are unsupported. Scalarize them.
3398	scalarizeInstruction(it);
3399	break;
3400	}// end of switch.
3401	}// end of for_each instr.
3402	}
3403
3404	void InnerLoopVectorizer::updateAnalysis() {
3405	// Forget the original basic block.
3406	SE->forgetLoop(OrigLoop);
3407
3408	// Update the dominator tree information.
3409	assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock ) && "Entry does not dominate exit.") ? static_cast< void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3410, __PRETTY_FUNCTION__))
3410	"Entry does not dominate exit.")((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock ) && "Entry does not dominate exit.") ? static_cast< void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3410, __PRETTY_FUNCTION__));
3411
3412	for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
3413	DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
3414	DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
3415
3416	// Due to if predication of stores we might create a sequence of "if(pred)
3417	// a[i] = ...; " blocks.
3418	for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
3419	if (i == 0)
3420	DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
3421	else if (isPredicatedBlock(i)) {
3422	DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
3423	} else {
3424	DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
3425	}
3426	}
3427
3428	DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
3429	DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
3430	DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
3431	DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
3432
3433	DEBUG(DT->verifyDomTree())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { DT->verifyDomTree(); } } while (0);
3434	}
3435
3436	/// \brief Check whether it is safe to if-convert this phi node.
3437	///
3438	/// Phi nodes with constant expressions that can trap are not safe to if
3439	/// convert.
3440	static bool canIfConvertPHINodes(BasicBlock *BB) {
3441	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
3442	PHINode *Phi = dyn_cast<PHINode>(I);
3443	if (!Phi)
3444	return true;
3445	for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
3446	if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
3447	if (C->canTrap())
3448	return false;
3449	}
3450	return true;
3451	}
3452
3453	bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
3454	if (!EnableIfConversion) {
3455	emitAnalysis(Report() << "if-conversion is disabled");
3456	return false;
3457	}
3458
3459	assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable")((TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable" ) ? static_cast<void> (0) : __assert_fail ("TheLoop->getNumBlocks() > 1 && \"Single block loops are vectorizable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3459, __PRETTY_FUNCTION__));
3460
3461	// A list of pointers that we can safely read and write to.
3462	SmallPtrSet<Value *, 8> SafePointes;
3463
3464	// Collect safe addresses.
3465	for (Loop::block_iterator BI = TheLoop->block_begin(),
3466	BE = TheLoop->block_end(); BI != BE; ++BI) {
3467	BasicBlock BB = BI;
3468
3469	if (blockNeedsPredication(BB))
3470	continue;
3471
3472	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
3473	if (LoadInst *LI = dyn_cast<LoadInst>(I))
3474	SafePointes.insert(LI->getPointerOperand());
3475	else if (StoreInst *SI = dyn_cast<StoreInst>(I))
3476	SafePointes.insert(SI->getPointerOperand());
3477	}
3478	}
3479
3480	// Collect the blocks that need predication.
3481	BasicBlock *Header = TheLoop->getHeader();
3482	for (Loop::block_iterator BI = TheLoop->block_begin(),
3483	BE = TheLoop->block_end(); BI != BE; ++BI) {
3484	BasicBlock BB = BI;
3485
3486	// We don't support switch statements inside loops.
3487	if (!isa<BranchInst>(BB->getTerminator())) {
3488	emitAnalysis(Report(BB->getTerminator())
3489	<< "loop contains a switch statement");
3490	return false;
3491	}
3492
3493	// We must be able to predicate all blocks that need to be predicated.
3494	if (blockNeedsPredication(BB)) {
3495	if (!blockCanBePredicated(BB, SafePointes)) {
3496	emitAnalysis(Report(BB->getTerminator())
3497	<< "control flow cannot be substituted for a select");
3498	return false;
3499	}
3500	} else if (BB != Header && !canIfConvertPHINodes(BB)) {
3501	emitAnalysis(Report(BB->getTerminator())
3502	<< "control flow cannot be substituted for a select");
3503	return false;
3504	}
3505	}
3506
3507	// We can if-convert this loop.
3508	return true;
3509	}
3510
3511	bool LoopVectorizationLegality::canVectorize() {
3512	// We must have a loop in canonical form. Loops with indirectbr in them cannot
3513	// be canonicalized.
3514	if (!TheLoop->getLoopPreheader()) {
3515	emitAnalysis(
3516	Report() << "loop control flow is not understood by vectorizer");
3517	return false;
3518	}
3519
3520	// We can only vectorize innermost loops.
3521	if (TheLoop->getSubLoopsVector().size()) {
3522	emitAnalysis(Report() << "loop is not the innermost loop");
3523	return false;
3524	}
3525
3526	// We must have a single backedge.
3527	if (TheLoop->getNumBackEdges() != 1) {
3528	emitAnalysis(
3529	Report() << "loop control flow is not understood by vectorizer");
3530	return false;
3531	}
3532
3533	// We must have a single exiting block.
3534	if (!TheLoop->getExitingBlock()) {
3535	emitAnalysis(
3536	Report() << "loop control flow is not understood by vectorizer");
3537	return false;
3538	}
3539
3540	// We need to have a loop header.
3541	DEBUG(dbgs() << "LV: Found a loop: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0)
3542	TheLoop->getHeader()->getName() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0);
3543
3544	// Check if we can if-convert non-single-bb loops.
3545	unsigned NumBlocks = TheLoop->getNumBlocks();
3546	if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
3547	DEBUG(dbgs() << "LV: Can't if-convert the loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't if-convert the loop.\n" ; } } while (0);
3548	return false;
3549	}
3550
3551	// ScalarEvolution needs to be able to find the exit count.
3552	const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
3553	if (ExitCount == SE->getCouldNotCompute()) {
3554	emitAnalysis(Report() << "could not determine number of loop iterations");
3555	DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: SCEV could not compute the loop exit count.\n" ; } } while (0);
3556	return false;
3557	}
3558
3559	// Check if we can vectorize the instructions and CFG in this loop.
3560	if (!canVectorizeInstrs()) {
3561	DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize the instructions or CFG\n" ; } } while (0);
3562	return false;
3563	}
3564
3565	// Go over each instruction and look at memory deps.
3566	if (!canVectorizeMemory()) {
3567	DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize due to memory conflicts\n" ; } } while (0);
3568	return false;
3569	}
3570
3571	// Collect all of the variables that remain uniform after vectorization.
3572	collectLoopUniforms();
3573
3574	DEBUG(dbgs() << "LV: We can vectorize this loop" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0)
3575	(PtrRtCheck.Need ? " (with a runtime bound check)" : "")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0)
3576	<<"!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0);
3577
3578	// Okay! We can vectorize. At this point we don't have any other mem analysis
3579	// which may limit our maximum vectorization factor, so just return true with
3580	// no restrictions.
3581	return true;
3582	}
3583
3584	static Type convertPointerToIntegerType(const DataLayout &DL, Type Ty) {
3585	if (Ty->isPointerTy())
3586	return DL.getIntPtrType(Ty);
3587
3588	// It is possible that char's or short's overflow when we ask for the loop's
3589	// trip count, work around this by changing the type size.
3590	if (Ty->getScalarSizeInBits() < 32)
3591	return Type::getInt32Ty(Ty->getContext());
3592
3593	return Ty;
3594	}
3595
3596	static Type* getWiderType(const DataLayout &DL, Type Ty0, Type Ty1) {
3597	Ty0 = convertPointerToIntegerType(DL, Ty0);
3598	Ty1 = convertPointerToIntegerType(DL, Ty1);
3599	if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
3600	return Ty0;
3601	return Ty1;
3602	}
3603
3604	/// \brief Check that the instruction has outside loop users and is not an
3605	/// identified reduction variable.
3606	static bool hasOutsideLoopUser(const Loop TheLoop, Instruction Inst,
3607	SmallPtrSetImpl<Value *> &Reductions) {
3608	// Reduction instructions are allowed to have exit users. All other
3609	// instructions must not have external users.
3610	if (!Reductions.count(Inst))
3611	//Check that all of the users of the loop are inside the BB.
3612	for (User *U : Inst->users()) {
3613	Instruction *UI = cast<Instruction>(U);
3614	// This user may be a reduction exit value.
3615	if (!TheLoop->contains(UI)) {
3616	DEBUG(dbgs() << "LV: Found an outside user for : " << UI << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an outside user for : " << UI << '\n'; } } while (0);
3617	return true;
3618	}
3619	}
3620	return false;
3621	}
3622
3623	bool LoopVectorizationLegality::canVectorizeInstrs() {
3624	BasicBlock *PreHeader = TheLoop->getLoopPreheader();
3625	BasicBlock *Header = TheLoop->getHeader();
3626
3627	// Look for the attribute signaling the absence of NaNs.
3628	Function &F = *Header->getParent();
3629	if (F.hasFnAttribute("no-nans-fp-math"))
3630	HasFunNoNaNAttr = F.getAttributes().getAttribute(
3631	AttributeSet::FunctionIndex,
3632	"no-nans-fp-math").getValueAsString() == "true";
3633
3634	// For each block in the loop.
3635	for (Loop::block_iterator bb = TheLoop->block_begin(),
3636	be = TheLoop->block_end(); bb != be; ++bb) {
3637
3638	// Scan the instructions in the block and look for hazards.
3639	for (BasicBlock::iterator it = (bb)->begin(), e = (bb)->end(); it != e;
3640	++it) {
3641
3642	if (PHINode *Phi = dyn_cast<PHINode>(it)) {
3643	Type *PhiTy = Phi->getType();
3644	// Check that this PHI type is allowed.
3645	if (!PhiTy->isIntegerTy() &&
3646	!PhiTy->isFloatingPointTy() &&
3647	!PhiTy->isPointerTy()) {
3648	emitAnalysis(Report(it)
3649	<< "loop control flow is not understood by vectorizer");
3650	DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an non-int non-pointer PHI.\n" ; } } while (0);
3651	return false;
3652	}
3653
3654	// If this PHINode is not in the header block, then we know that we
3655	// can convert it to select during if-conversion. No need to check if
3656	// the PHIs in this block are induction or reduction variables.
3657	if (*bb != Header) {
3658	// Check that this instruction has no outside users or is an
3659	// identified reduction value with an outside user.
3660	if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
3661	continue;
3662	emitAnalysis(Report(it) << "value could not be identified as "
3663	"an induction or reduction variable");
3664	return false;
3665	}
3666
3667	// We only allow if-converted PHIs with more than two incoming values.
3668	if (Phi->getNumIncomingValues() != 2) {
3669	emitAnalysis(Report(it)
3670	<< "control flow not understood by vectorizer");
3671	DEBUG(dbgs() << "LV: Found an invalid PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an invalid PHI.\n" ; } } while (0);
3672	return false;
3673	}
3674
3675	// This is the value coming from the preheader.
3676	Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
3677	// Check if this is an induction variable.
3678	InductionKind IK = isInductionVariable(Phi);
3679
3680	if (IK_NoInduction != IK) {
3681	// Get the widest type.
3682	if (!WidestIndTy)
3683	WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
3684	else
3685	WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
3686
3687	// Int inductions are special because we only allow one IV.
3688	if (IK == IK_IntInduction) {
3689	// Use the phi node with the widest type as induction. Use the last
3690	// one if there are multiple (no good reason for doing this other
3691	// than it is expedient).
3692	if (!Induction \|\| PhiTy == WidestIndTy)
3693	Induction = Phi;
3694	}
3695
3696	DEBUG(dbgs() << "LV: Found an induction variable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an induction variable.\n" ; } } while (0);
3697	Inductions[Phi] = InductionInfo(StartValue, IK);
3698
3699	// Until we explicitly handle the case of an induction variable with
3700	// an outside loop user we have to give up vectorizing this loop.
3701	if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
3702	emitAnalysis(Report(it) << "use of induction value outside of the "
3703	"loop is not handled by vectorizer");
3704	return false;
3705	}
3706
3707	continue;
3708	}
3709
3710	if (AddReductionVar(Phi, RK_IntegerAdd)) {
3711	DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an ADD reduction PHI." << Phi <<"\n"; } } while (0);
3712	continue;
3713	}
3714	if (AddReductionVar(Phi, RK_IntegerMult)) {
3715	DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a MUL reduction PHI." << Phi <<"\n"; } } while (0);
3716	continue;
3717	}
3718	if (AddReductionVar(Phi, RK_IntegerOr)) {
3719	DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an OR reduction PHI." << Phi <<"\n"; } } while (0);
3720	continue;
3721	}
3722	if (AddReductionVar(Phi, RK_IntegerAnd)) {
3723	DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an AND reduction PHI." << Phi <<"\n"; } } while (0);
3724	continue;
3725	}
3726	if (AddReductionVar(Phi, RK_IntegerXor)) {
3727	DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a XOR reduction PHI." << Phi <<"\n"; } } while (0);
3728	continue;
3729	}
3730	if (AddReductionVar(Phi, RK_IntegerMinMax)) {
3731	DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a MINMAX reduction PHI." << Phi <<"\n"; } } while (0);
3732	continue;
3733	}
3734	if (AddReductionVar(Phi, RK_FloatMult)) {
3735	DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an FMult reduction PHI." << Phi <<"\n"; } } while (0);
3736	continue;
3737	}
3738	if (AddReductionVar(Phi, RK_FloatAdd)) {
3739	DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an FAdd reduction PHI." << Phi <<"\n"; } } while (0);
3740	continue;
3741	}
3742	if (AddReductionVar(Phi, RK_FloatMinMax)) {
3743	DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< Phi <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an float MINMAX reduction PHI." << Phi << "\n"; } } while (0)
3744	"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an float MINMAX reduction PHI." << *Phi << "\n"; } } while (0);
3745	continue;
3746	}
3747
3748	emitAnalysis(Report(it) << "value that could not be identified as "
3749	"reduction is used outside the loop");
3750	DEBUG(dbgs() << "LV: Found an unidentified PHI."<< Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an unidentified PHI." << Phi <<"\n"; } } while (0);
3751	return false;
3752	}// end of PHI handling
3753
3754	// We still don't handle functions. However, we can ignore dbg intrinsic
3755	// calls and we do handle certain intrinsic and libm functions.
3756	CallInst *CI = dyn_cast<CallInst>(it);
3757	if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
3758	emitAnalysis(Report(it) << "call instruction cannot be vectorized");
3759	DEBUG(dbgs() << "LV: Found a call site.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a call site.\n" ; } } while (0);
3760	return false;
3761	}
3762
3763	// Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
3764	// second argument is the same (i.e. loop invariant)
3765	if (CI &&
3766	hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
3767	if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
3768	emitAnalysis(Report(it)
3769	<< "intrinsic instruction cannot be vectorized");
3770	DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable intrinsic " << CI << "\n"; } } while (0);
3771	return false;
3772	}
3773	}
3774
3775	// Check that the instruction return type is vectorizable.
3776	// Also, we can't vectorize extractelement instructions.
3777	if ((!VectorType::isValidElementType(it->getType()) &&
3778	!it->getType()->isVoidTy()) \|\| isa<ExtractElementInst>(it)) {
3779	emitAnalysis(Report(it)
3780	<< "instruction return type cannot be vectorized");
3781	DEBUG(dbgs() << "LV: Found unvectorizable type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable type.\n" ; } } while (0);
3782	return false;
3783	}
3784
3785	// Check that the stored type is vectorizable.
3786	if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
3787	Type *T = ST->getValueOperand()->getType();
3788	if (!VectorType::isValidElementType(T)) {
3789	emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
3790	return false;
3791	}
3792	if (EnableMemAccessVersioning)
3793	collectStridedAcccess(ST);
3794	}
3795
3796	if (EnableMemAccessVersioning)
3797	if (LoadInst *LI = dyn_cast<LoadInst>(it))
3798	collectStridedAcccess(LI);
3799
3800	// Reduction instructions are allowed to have exit users.
3801	// All other instructions must not have external users.
3802	if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
3803	emitAnalysis(Report(it) << "value cannot be used outside the loop");
3804	return false;
3805	}
3806
3807	} // next instr.
3808
3809	}
3810
3811	if (!Induction) {
3812	DEBUG(dbgs() << "LV: Did not find one integer induction var.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Did not find one integer induction var.\n" ; } } while (0);
3813	if (Inductions.empty()) {
3814	emitAnalysis(Report()
3815	<< "loop induction variable could not be identified");
3816	return false;
3817	}
3818	}
3819
3820	return true;
3821	}
3822
3823	///\brief Remove GEPs whose indices but the last one are loop invariant and
3824	/// return the induction operand of the gep pointer.
3825	static Value stripGetElementPtr(Value Ptr, ScalarEvolution *SE,
3826	const DataLayout DL, Loop Lp) {
3827	GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
3828	if (!GEP)
3829	return Ptr;
3830
3831	unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
3832
3833	// Check that all of the gep indices are uniform except for our induction
3834	// operand.
3835	for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
3836	if (i != InductionOperand &&
3837	!SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
3838	return Ptr;
3839	return GEP->getOperand(InductionOperand);
3840	}
3841
3842	///\brief Look for a cast use of the passed value.
3843	static Value getUniqueCastUse(Value Ptr, Loop Lp, Type Ty) {
3844	Value *UniqueCast = nullptr;
3845	for (User *U : Ptr->users()) {
3846	CastInst *CI = dyn_cast<CastInst>(U);
3847	if (CI && CI->getType() == Ty) {
3848	if (!UniqueCast)
3849	UniqueCast = CI;
3850	else
3851	return nullptr;
3852	}
3853	}
3854	return UniqueCast;
3855	}
3856
3857	///\brief Get the stride of a pointer access in a loop.
3858	/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
3859	/// pointer to the Value, or null otherwise.
3860	static Value getStrideFromPointer(Value Ptr, ScalarEvolution *SE,
3861	const DataLayout DL, Loop Lp) {
3862	const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
3863	if (!PtrTy \|\| PtrTy->isAggregateType())
3864	return nullptr;
3865
3866	// Try to remove a gep instruction to make the pointer (actually index at this
3867	// point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the
3868	// pointer, otherwise, we are analyzing the index.
3869	Value *OrigPtr = Ptr;
3870
3871	// The size of the pointer access.
3872	int64_t PtrAccessSize = 1;
3873
3874	Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
3875	const SCEV *V = SE->getSCEV(Ptr);
3876
3877	if (Ptr != OrigPtr)
3878	// Strip off casts.
3879	while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
3880	V = C->getOperand();
3881
3882	const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
3883	if (!S)
3884	return nullptr;
3885
3886	V = S->getStepRecurrence(*SE);
3887	if (!V)
3888	return nullptr;
3889
3890	// Strip off the size of access multiplication if we are still analyzing the
3891	// pointer.
3892	if (OrigPtr == Ptr) {
3893	DL->getTypeAllocSize(PtrTy->getElementType());
3894	if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
3895	if (M->getOperand(0)->getSCEVType() != scConstant)
3896	return nullptr;
3897
3898	const APInt &APStepVal =
3899	cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
3900
3901	// Huge step value - give up.
3902	if (APStepVal.getBitWidth() > 64)
3903	return nullptr;
3904
3905	int64_t StepVal = APStepVal.getSExtValue();
3906	if (PtrAccessSize != StepVal)
3907	return nullptr;
3908	V = M->getOperand(1);
3909	}
3910	}
3911
3912	// Strip off casts.
3913	Type *StripedOffRecurrenceCast = nullptr;
3914	if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
3915	StripedOffRecurrenceCast = C->getType();
3916	V = C->getOperand();
3917	}
3918
3919	// Look for the loop invariant symbolic value.
3920	const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
3921	if (!U)
3922	return nullptr;
3923
3924	Value *Stride = U->getValue();
3925	if (!Lp->isLoopInvariant(Stride))
3926	return nullptr;
3927
3928	// If we have stripped off the recurrence cast we have to make sure that we
3929	// return the value that is used in this loop so that we can replace it later.
3930	if (StripedOffRecurrenceCast)
3931	Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
3932
3933	return Stride;
3934	}
3935
3936	void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
3937	Value *Ptr = nullptr;
3938	if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
3939	Ptr = LI->getPointerOperand();
3940	else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
3941	Ptr = SI->getPointerOperand();
3942	else
3943	return;
3944
3945	Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
3946	if (!Stride)
3947	return;
3948
3949	DEBUG(dbgs() << "LV: Found a strided access that we can version")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a strided access that we can version" ; } } while (0);
3950	DEBUG(dbgs() << " Ptr: " << Ptr << " Stride: " << Stride << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " Ptr: " << Ptr << " Stride: " << Stride << "\n"; } } while (0);
3951	Strides[Ptr] = Stride;
3952	StrideSet.insert(Stride);
3953	}
3954
3955	void LoopVectorizationLegality::collectLoopUniforms() {
3956	// We now know that the loop is vectorizable!
3957	// Collect variables that will remain uniform after vectorization.
3958	std::vector<Value*> Worklist;
3959	BasicBlock *Latch = TheLoop->getLoopLatch();
3960
3961	// Start with the conditional branch and walk up the block.
3962	Worklist.push_back(Latch->getTerminator()->getOperand(0));
3963
3964	// Also add all consecutive pointer values; these values will be uniform
3965	// after vectorization (and subsequent cleanup) and, until revectorization is
3966	// supported, all dependencies must also be uniform.
3967	for (Loop::block_iterator B = TheLoop->block_begin(),
3968	BE = TheLoop->block_end(); B != BE; ++B)
3969	for (BasicBlock::iterator I = (B)->begin(), IE = (B)->end();
3970	I != IE; ++I)
3971	if (I->getType()->isPointerTy() && isConsecutivePtr(I))
3972	Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
3973
3974	while (Worklist.size()) {
3975	Instruction *I = dyn_cast<Instruction>(Worklist.back());
3976	Worklist.pop_back();
3977
3978	// Look at instructions inside this loop.
3979	// Stop when reaching PHI nodes.
3980	// TODO: we need to follow values all over the loop, not only in this block.
3981	if (!I \|\| !TheLoop->contains(I) \|\| isa<PHINode>(I))
3982	continue;
3983
3984	// This is a known uniform.
3985	Uniforms.insert(I);
3986
3987	// Insert all operands.
3988	Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
3989	}
3990	}
3991
3992	namespace {
3993	/// \brief Analyses memory accesses in a loop.
3994	///
3995	/// Checks whether run time pointer checks are needed and builds sets for data
3996	/// dependence checking.
3997	class AccessAnalysis {
3998	public:
3999	/// \brief Read or write access location.
4000	typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
4001	typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
4002
4003	/// \brief Set of potential dependent memory accesses.
4004	typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
4005
4006	AccessAnalysis(const DataLayout Dl, AliasAnalysis AA, DepCandidates &DA) :
4007	DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
4008
4009	/// \brief Register a load and whether it is only read from.
4010	void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
4011	Value Ptr = const_cast<Value>(Loc.Ptr);
4012	AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
4013	Accesses.insert(MemAccessInfo(Ptr, false));
4014	if (IsReadOnly)
4015	ReadOnlyPtr.insert(Ptr);
4016	}
4017
4018	/// \brief Register a store.
4019	void addStore(AliasAnalysis::Location &Loc) {
4020	Value Ptr = const_cast<Value>(Loc.Ptr);
4021	AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
4022	Accesses.insert(MemAccessInfo(Ptr, true));
4023	}
4024
4025	/// \brief Check whether we can check the pointers at runtime for
4026	/// non-intersection.
4027	bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
4028	unsigned &NumComparisons, ScalarEvolution *SE,
4029	Loop *TheLoop, ValueToValueMap &Strides,
4030	bool ShouldCheckStride = false);
4031
4032	/// \brief Goes over all memory accesses, checks whether a RT check is needed
4033	/// and builds sets of dependent accesses.
4034	void buildDependenceSets() {
4035	processMemAccesses();
4036	}
4037
4038	bool isRTCheckNeeded() { return IsRTCheckNeeded; }
4039
4040	bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
4041	void resetDepChecks() { CheckDeps.clear(); }
4042
4043	MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
4044
4045	private:
4046	typedef SetVector<MemAccessInfo> PtrAccessSet;
4047
4048	/// \brief Go over all memory access and check whether runtime pointer checks
4049	/// are needed /// and build sets of dependency check candidates.
4050	void processMemAccesses();
4051
4052	/// Set of all accesses.
4053	PtrAccessSet Accesses;
4054
4055	/// Set of accesses that need a further dependence check.
4056	MemAccessInfoSet CheckDeps;
4057
4058	/// Set of pointers that are read only.
4059	SmallPtrSet<Value*, 16> ReadOnlyPtr;
4060
4061	const DataLayout *DL;
4062
4063	/// An alias set tracker to partition the access set by underlying object and
4064	//intrinsic property (such as TBAA metadata).
4065	AliasSetTracker AST;
4066
4067	/// Sets of potentially dependent accesses - members of one set share an
4068	/// underlying pointer. The set "CheckDeps" identfies which sets really need a
4069	/// dependence check.
4070	DepCandidates &DepCands;
4071
4072	bool IsRTCheckNeeded;
4073	};
4074
4075	} // end anonymous namespace
4076
4077	/// \brief Check whether a pointer can participate in a runtime bounds check.
4078	static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
4079	Value *Ptr) {
4080	const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
4081	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
4082	if (!AR)
4083	return false;
4084
4085	return AR->isAffine();
4086	}
4087
4088	/// \brief Check the stride of the pointer and ensure that it does not wrap in
4089	/// the address space.
4090	static int isStridedPtr(ScalarEvolution SE, const DataLayout DL, Value *Ptr,
4091	const Loop *Lp, ValueToValueMap &StridesMap);
4092
4093	bool AccessAnalysis::canCheckPtrAtRT(
4094	LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
4095	unsigned &NumComparisons, ScalarEvolution SE, Loop TheLoop,
4096	ValueToValueMap &StridesMap, bool ShouldCheckStride) {
4097	// Find pointers with computable bounds. We are going to use this information
4098	// to place a runtime bound check.
4099	bool CanDoRT = true;
4100
4101	bool IsDepCheckNeeded = isDependencyCheckNeeded();
4102	NumComparisons = 0;
4103
4104	// We assign a consecutive id to access from different alias sets.
4105	// Accesses between different groups doesn't need to be checked.
4106	unsigned ASId = 1;
4107	for (auto &AS : AST) {
4108	unsigned NumReadPtrChecks = 0;
4109	unsigned NumWritePtrChecks = 0;
4110
4111	// We assign consecutive id to access from different dependence sets.
4112	// Accesses within the same set don't need a runtime check.
4113	unsigned RunningDepId = 1;
4114	DenseMap<Value *, unsigned> DepSetId;
4115
4116	for (auto A : AS) {
4117	Value *Ptr = A.getValue();
4118	bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
4119	MemAccessInfo Access(Ptr, IsWrite);
4120
4121	if (IsWrite)
4122	++NumWritePtrChecks;
4123	else
4124	++NumReadPtrChecks;
4125
4126	if (hasComputableBounds(SE, StridesMap, Ptr) &&
4127	// When we run after a failing dependency check we have to make sure we
4128	// don't have wrapping pointers.
4129	(!ShouldCheckStride \|\|
4130	isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
4131	// The id of the dependence set.
4132	unsigned DepId;
4133
4134	if (IsDepCheckNeeded) {
4135	Value *Leader = DepCands.getLeaderValue(Access).getPointer();
4136	unsigned &LeaderId = DepSetId[Leader];
4137	if (!LeaderId)
4138	LeaderId = RunningDepId++;
4139	DepId = LeaderId;
4140	} else
4141	// Each access has its own dependence set.
4142	DepId = RunningDepId++;
4143
4144	RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
4145
4146	DEBUG(dbgs() << "LV: Found a runtime check ptr:" << Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a runtime check ptr:" << Ptr << '\n'; } } while (0);
4147	} else {
4148	CanDoRT = false;
4149	}
4150	}
4151
4152	if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
4153	NumComparisons += 0; // Only one dependence set.
4154	else {
4155	NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
4156	NumWritePtrChecks - 1));
4157	}
4158
4159	++ASId;
4160	}
4161
4162	// If the pointers that we would use for the bounds comparison have different
4163	// address spaces, assume the values aren't directly comparable, so we can't
4164	// use them for the runtime check. We also have to assume they could
4165	// overlap. In the future there should be metadata for whether address spaces
4166	// are disjoint.
4167	unsigned NumPointers = RtCheck.Pointers.size();
4168	for (unsigned i = 0; i < NumPointers; ++i) {
4169	for (unsigned j = i + 1; j < NumPointers; ++j) {
4170	// Only need to check pointers between two different dependency sets.
4171	if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
4172	continue;
4173	// Only need to check pointers in the same alias set.
4174	if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
4175	continue;
4176
4177	Value *PtrI = RtCheck.Pointers[i];
4178	Value *PtrJ = RtCheck.Pointers[j];
4179
4180	unsigned ASi = PtrI->getType()->getPointerAddressSpace();
4181	unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
4182	if (ASi != ASj) {
4183	DEBUG(dbgs() << "LV: Runtime check would require comparison between"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0)
4184	" different address spaces\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0);
4185	return false;
4186	}
4187	}
4188	}
4189
4190	return CanDoRT;
4191	}
4192
4193	void AccessAnalysis::processMemAccesses() {
4194	// We process the set twice: first we process read-write pointers, last we
4195	// process read-only pointers. This allows us to skip dependence tests for
4196	// read-only pointers.
4197
4198	DEBUG(dbgs() << "LV: Processing memory accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Processing memory accesses...\n" ; } } while (0);
4199	DEBUG(dbgs() << " AST: "; AST.dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " AST: "; AST.dump(); } } while (0);
4200	DEBUG(dbgs() << "LV: Accesses:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Accesses:\n"; } } while (0);
4201	DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0)
4202	for (auto A : Accesses)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0)
4203	dbgs() << "\t" << A.getPointer() << " (" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0)
4204	(A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0)
4205	"read-only" : "read")) << ")\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0)
4206	})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0);
4207
4208	// The AliasSetTracker has nicely partitioned our pointers by metadata
4209	// compatibility and potential for underlying-object overlap. As a result, we
4210	// only need to check for potential pointer dependencies within each alias
4211	// set.
4212	for (auto &AS : AST) {
4213	// Note that both the alias-set tracker and the alias sets themselves used
4214	// linked lists internally and so the iteration order here is deterministic
4215	// (matching the original instruction order within each set).
4216
4217	bool SetHasWrite = false;
4218
4219	// Map of pointers to last access encountered.
4220	typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
4221	UnderlyingObjToAccessMap ObjToLastAccess;
4222
4223	// Set of access to check after all writes have been processed.
4224	PtrAccessSet DeferredAccesses;
4225
4226	// Iterate over each alias set twice, once to process read/write pointers,
4227	// and then to process read-only pointers.
4228	for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
4229	bool UseDeferred = SetIteration > 0;
4230	PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
4231
4232	for (auto A : AS) {
4233	Value *Ptr = A.getValue();
4234	bool IsWrite = S.count(MemAccessInfo(Ptr, true));
4235
4236	// If we're using the deferred access set, then it contains only reads.
4237	bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
4238	if (UseDeferred && !IsReadOnlyPtr)
4239	continue;
4240	// Otherwise, the pointer must be in the PtrAccessSet, either as a read
4241	// or a write.
4242	assert(((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\|((((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__))
4243	S.count(MemAccessInfo(Ptr, false))) &&((((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__))
4244	"Alias-set pointer not in the access set?")((((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\| S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__));
4245
4246	MemAccessInfo Access(Ptr, IsWrite);
4247	DepCands.insert(Access);
4248
4249	// Memorize read-only pointers for later processing and skip them in the
4250	// first round (they need to be checked after we have seen all write
4251	// pointers). Note: we also mark pointer that are not consecutive as
4252	// "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need
4253	// the second check for "!IsWrite".
4254	if (!UseDeferred && IsReadOnlyPtr) {
4255	DeferredAccesses.insert(Access);
4256	continue;
4257	}
4258
4259	// If this is a write - check other reads and writes for conflicts. If
4260	// this is a read only check other writes for conflicts (but only if
4261	// there is no other write to the ptr - this is an optimization to
4262	// catch "a[i] = a[i] + " without having to do a dependence check).
4263	if ((IsWrite \|\| IsReadOnlyPtr) && SetHasWrite) {
4264	CheckDeps.insert(Access);
4265	IsRTCheckNeeded = true;
4266	}
4267
4268	if (IsWrite)
4269	SetHasWrite = true;
4270
4271	// Create sets of pointers connected by a shared alias set and
4272	// underlying object.
4273	typedef SmallVector<Value *, 16> ValueVector;
4274	ValueVector TempObjects;
4275	GetUnderlyingObjects(Ptr, TempObjects, DL);
4276	for (Value *UnderlyingObj : TempObjects) {
4277	UnderlyingObjToAccessMap::iterator Prev =
4278	ObjToLastAccess.find(UnderlyingObj);
4279	if (Prev != ObjToLastAccess.end())
4280	DepCands.unionSets(Access, Prev->second);
4281
4282	ObjToLastAccess[UnderlyingObj] = Access;
4283	}
4284	}
4285	}
4286	}
4287	}
4288
4289	namespace {
4290	/// \brief Checks memory dependences among accesses to the same underlying
4291	/// object to determine whether there vectorization is legal or not (and at
4292	/// which vectorization factor).
4293	///
4294	/// This class works under the assumption that we already checked that memory
4295	/// locations with different underlying pointers are "must-not alias".
4296	/// We use the ScalarEvolution framework to symbolically evalutate access
4297	/// functions pairs. Since we currently don't restructure the loop we can rely
4298	/// on the program order of memory accesses to determine their safety.
4299	/// At the moment we will only deem accesses as safe for:
4300	/// * A negative constant distance assuming program order.
4301	///
4302	/// Safe: tmp = a[i + 1]; OR a[i + 1] = x;
4303	/// a[i] = tmp; y = a[i];
4304	///
4305	/// The latter case is safe because later checks guarantuee that there can't
4306	/// be a cycle through a phi node (that is, we check that "x" and "y" is not
4307	/// the same variable: a header phi can only be an induction or a reduction, a
4308	/// reduction can't have a memory sink, an induction can't have a memory
4309	/// source). This is important and must not be violated (or we have to
4310	/// resort to checking for cycles through memory).
4311	///
4312	/// * A positive constant distance assuming program order that is bigger
4313	/// than the biggest memory access.
4314	///
4315	/// tmp = a[i] OR b[i] = x
4316	/// a[i+2] = tmp y = b[i+2];
4317	///
4318	/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
4319	///
4320	/// * Zero distances and all accesses have the same size.
4321	///
4322	class MemoryDepChecker {
4323	public:
4324	typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
4325	typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
4326
4327	MemoryDepChecker(ScalarEvolution Se, const DataLayout Dl, const Loop *L)
4328	: SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
4329	ShouldRetryWithRuntimeCheck(false) {}
4330
4331	/// \brief Register the location (instructions are given increasing numbers)
4332	/// of a write access.
4333	void addAccess(StoreInst *SI) {
4334	Value *Ptr = SI->getPointerOperand();
4335	Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
4336	InstMap.push_back(SI);
4337	++AccessIdx;
4338	}
4339
4340	/// \brief Register the location (instructions are given increasing numbers)
4341	/// of a write access.
4342	void addAccess(LoadInst *LI) {
4343	Value *Ptr = LI->getPointerOperand();
4344	Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
4345	InstMap.push_back(LI);
4346	++AccessIdx;
4347	}
4348
4349	/// \brief Check whether the dependencies between the accesses are safe.
4350	///
4351	/// Only checks sets with elements in \p CheckDeps.
4352	bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
4353	MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
4354
4355	/// \brief The maximum number of bytes of a vector register we can vectorize
4356	/// the accesses safely with.
4357	unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
4358
4359	/// \brief In same cases when the dependency check fails we can still
4360	/// vectorize the loop with a dynamic array access check.
4361	bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
4362
4363	private:
4364	ScalarEvolution *SE;
4365	const DataLayout *DL;
4366	const Loop *InnermostLoop;
4367
4368	/// \brief Maps access locations (ptr, read/write) to program order.
4369	DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
4370
4371	/// \brief Memory access instructions in program order.
4372	SmallVector<Instruction *, 16> InstMap;
4373
4374	/// \brief The program order index to be used for the next instruction.
4375	unsigned AccessIdx;
4376
4377	// We can access this many bytes in parallel safely.
4378	unsigned MaxSafeDepDistBytes;
4379
4380	/// \brief If we see a non-constant dependence distance we can still try to
4381	/// vectorize this loop with runtime checks.
4382	bool ShouldRetryWithRuntimeCheck;
4383
4384	/// \brief Check whether there is a plausible dependence between the two
4385	/// accesses.
4386	///
4387	/// Access \p A must happen before \p B in program order. The two indices
4388	/// identify the index into the program order map.
4389	///
4390	/// This function checks whether there is a plausible dependence (or the
4391	/// absence of such can't be proved) between the two accesses. If there is a
4392	/// plausible dependence but the dependence distance is bigger than one
4393	/// element access it records this distance in \p MaxSafeDepDistBytes (if this
4394	/// distance is smaller than any other distance encountered so far).
4395	/// Otherwise, this function returns true signaling a possible dependence.
4396	bool isDependent(const MemAccessInfo &A, unsigned AIdx,
4397	const MemAccessInfo &B, unsigned BIdx,
4398	ValueToValueMap &Strides);
4399
4400	/// \brief Check whether the data dependence could prevent store-load
4401	/// forwarding.
4402	bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
4403	};
4404
4405	} // end anonymous namespace
4406
4407	static bool isInBoundsGep(Value *Ptr) {
4408	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
4409	return GEP->isInBounds();
4410	return false;
4411	}
4412
4413	/// \brief Check whether the access through \p Ptr has a constant stride.
4414	static int isStridedPtr(ScalarEvolution SE, const DataLayout DL, Value *Ptr,
4415	const Loop *Lp, ValueToValueMap &StridesMap) {
4416	const Type *Ty = Ptr->getType();
4417	assert(Ty->isPointerTy() && "Unexpected non-ptr")((Ty->isPointerTy() && "Unexpected non-ptr") ? static_cast <void> (0) : __assert_fail ("Ty->isPointerTy() && \"Unexpected non-ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4417, __PRETTY_FUNCTION__));
4418
4419	// Make sure that the pointer does not point to aggregate types.
4420	const PointerType *PtrTy = cast<PointerType>(Ty);
4421	if (PtrTy->getElementType()->isAggregateType()) {
4422	DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << Ptr << "\n"; } } while (0)
4423	"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0);
4424	return 0;
4425	}
4426
4427	const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
4428
4429	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
4430	if (!AR) {
4431	DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0)
4432	<< Ptr << " SCEV: " << PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0);
4433	return 0;
4434	}
4435
4436	// The accesss function must stride over the innermost loop.
4437	if (Lp != AR->getLoop()) {
4438	DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0)
4439	Ptr << " SCEV: " << PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0);
4440	}
4441
4442	// The address calculation must not wrap. Otherwise, a dependence could be
4443	// inverted.
4444	// An inbounds getelementptr that is a AddRec with a unit stride
4445	// cannot wrap per definition. The unit stride requirement is checked later.
4446	// An getelementptr without an inbounds attribute and unit stride would have
4447	// to access the pointer value "0" which is undefined behavior in address
4448	// space 0, therefore we can also vectorize this case.
4449	bool IsInBoundsGEP = isInBoundsGep(Ptr);
4450	bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
4451	bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
4452	if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
4453	DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0)
4454	<< Ptr << " SCEV: " << PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << Ptr << " SCEV: " << PtrScev << "\n" ; } } while (0);
4455	return 0;
4456	}
4457
4458	// Check the step is constant.
4459	const SCEV Step = AR->getStepRecurrence(SE);
4460
4461	// Calculate the pointer stride and check if it is consecutive.
4462	const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
4463	if (!C) {
4464	DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0)
4465	" SCEV: " << PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0);
4466	return 0;
4467	}
4468
4469	int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
4470	const APInt &APStepVal = C->getValue()->getValue();
4471
4472	// Huge step value - give up.
4473	if (APStepVal.getBitWidth() > 64)
4474	return 0;
4475
4476	int64_t StepVal = APStepVal.getSExtValue();
4477
4478	// Strided access.
4479	int64_t Stride = StepVal / Size;
4480	int64_t Rem = StepVal % Size;
4481	if (Rem)
4482	return 0;
4483
4484	// If the SCEV could wrap but we have an inbounds gep with a unit stride we
4485	// know we can't "wrap around the address space". In case of address space
4486	// zero we know that this won't happen without triggering undefined behavior.
4487	if (!IsNoWrapAddRec && (IsInBoundsGEP \|\| IsInAddressSpaceZero) &&
4488	Stride != 1 && Stride != -1)
4489	return 0;
4490
4491	return Stride;
4492	}
4493
4494	bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
4495	unsigned TypeByteSize) {
4496	// If loads occur at a distance that is not a multiple of a feasible vector
4497	// factor store-load forwarding does not take place.
4498	// Positive dependences might cause troubles because vectorizing them might
4499	// prevent store-load forwarding making vectorized code run a lot slower.
4500	// a[i] = a[i-3] ^ a[i-8];
4501	// The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
4502	// hence on your typical architecture store-load forwarding does not take
4503	// place. Vectorizing in such cases does not make sense.
4504	// Store-load forwarding distance.
4505	const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
4506	// Maximum vector factor.
4507	unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
4508	if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
4509	MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
4510
4511	for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
4512	vf *= 2) {
4513	if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
4514	MaxVFWithoutSLForwardIssues = (vf >>=1);
4515	break;
4516	}
4517	}
4518
4519	if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
4520	DEBUG(dbgs() << "LV: Distance " << Distance <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0)
4521	" that could cause a store-load forwarding conflict\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0);
4522	return true;
4523	}
4524
4525	if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
4526	MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
4527	MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
4528	return false;
4529	}
4530
4531	bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
4532	const MemAccessInfo &B, unsigned BIdx,
4533	ValueToValueMap &Strides) {
4534	assert (AIdx < BIdx && "Must pass arguments in program order")((AIdx < BIdx && "Must pass arguments in program order" ) ? static_cast<void> (0) : __assert_fail ("AIdx < BIdx && \"Must pass arguments in program order\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4534, __PRETTY_FUNCTION__));
4535
4536	Value *APtr = A.getPointer();
4537	Value *BPtr = B.getPointer();
4538	bool AIsWrite = A.getInt();
4539	bool BIsWrite = B.getInt();
4540
4541	// Two reads are independent.
4542	if (!AIsWrite && !BIsWrite)
4543	return false;
4544
4545	// We cannot check pointers in different address spaces.
4546	if (APtr->getType()->getPointerAddressSpace() !=
4547	BPtr->getType()->getPointerAddressSpace())
4548	return true;
4549
4550	const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
4551	const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
4552
4553	int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
4554	int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
4555
4556	const SCEV *Src = AScev;
4557	const SCEV *Sink = BScev;
4558
4559	// If the induction step is negative we have to invert source and sink of the
4560	// dependence.
4561	if (StrideAPtr < 0) {
4562	//Src = BScev;
4563	//Sink = AScev;
4564	std::swap(APtr, BPtr);
4565	std::swap(Src, Sink);
4566	std::swap(AIsWrite, BIsWrite);
4567	std::swap(AIdx, BIdx);
4568	std::swap(StrideAPtr, StrideBPtr);
4569	}
4570
4571	const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
4572
4573	DEBUG(dbgs() << "LV: Src Scev: " << Src << "Sink Scev: " << Sinkdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << Src << "Sink Scev: " << Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0)
4574	<< "(Induction step: " << StrideAPtr << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << Src << "Sink Scev: " << Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0);
4575	DEBUG(dbgs() << "LV: Distance for " << InstMap[AIdx] << " to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << InstMap[AIdx] << " to " << InstMap[BIdx] << ": " << Dist << "\n"; } } while (0)
4576	<< InstMap[BIdx] << ": " << Dist << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << InstMap[AIdx] << " to " << InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0);
4577
4578	// Need consecutive accesses. We don't want to vectorize
4579	// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
4580	// the address space.
4581	if (!StrideAPtr \|\| !StrideBPtr \|\| StrideAPtr != StrideBPtr){
4582	DEBUG(dbgs() << "Non-consecutive pointer access\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "Non-consecutive pointer access\n" ; } } while (0);
4583	return true;
4584	}
4585
4586	const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
4587	if (!C) {
4588	DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence because of non-constant distance\n" ; } } while (0);
4589	ShouldRetryWithRuntimeCheck = true;
4590	return true;
4591	}
4592
4593	Type *ATy = APtr->getType()->getPointerElementType();
4594	Type *BTy = BPtr->getType()->getPointerElementType();
4595	unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
4596
4597	// Negative distances are not plausible dependencies.
4598	const APInt &Val = C->getValue()->getValue();
4599	if (Val.isNegative()) {
4600	bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
4601	if (IsTrueDataDependence &&
4602	(couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) \|\|
4603	ATy != BTy))
4604	return true;
4605
4606	DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence is negative: NoDep\n" ; } } while (0);
4607	return false;
4608	}
4609
4610	// Write to the same location with the same size.
4611	// Could be improved to assert type sizes are the same (i32 == float, etc).
4612	if (Val == 0) {
4613	if (ATy == BTy)
4614	return false;
4615	DEBUG(dbgs() << "LV: Zero dependence difference but different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Zero dependence difference but different types\n" ; } } while (0);
4616	return true;
4617	}
4618
4619	assert(Val.isStrictlyPositive() && "Expect a positive value")((Val.isStrictlyPositive() && "Expect a positive value" ) ? static_cast<void> (0) : __assert_fail ("Val.isStrictlyPositive() && \"Expect a positive value\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4619, __PRETTY_FUNCTION__));
4620
4621	// Positive distance bigger than max vectorization factor.
4622	if (ATy != BTy) {
4623	DEBUG(dbgs() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ReadWrite-Write positive dependency with different types\n" ; } } while (0)
4624	"LV: ReadWrite-Write positive dependency with different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ReadWrite-Write positive dependency with different types\n" ; } } while (0);
4625	return false;
4626	}
4627
4628	unsigned Distance = (unsigned) Val.getZExtValue();
4629
4630	// Bail out early if passed-in parameters make vectorization not feasible.
4631	unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
4632	unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
4633
4634	// The distance must be bigger than the size needed for a vectorized version
4635	// of the operation and the size of the vectorized operation must not be
4636	// bigger than the currrent maximum size.
4637	if (Distance < 2*TypeByteSize \|\|
4638	2*TypeByteSize > MaxSafeDepDistBytes \|\|
4639	Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
4640	DEBUG(dbgs() << "LV: Failure because of Positive distance "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0)
4641	<< Val.getSExtValue() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0);
4642	return true;
4643	}
4644
4645	MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
4646	Distance : MaxSafeDepDistBytes;
4647
4648	bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
4649	if (IsTrueDataDependence &&
4650	couldPreventStoreLoadForward(Distance, TypeByteSize))
4651	return true;
4652
4653	DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0)
4654	" with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0);
4655
4656	return false;
4657	}
4658
4659	bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
4660	MemAccessInfoSet &CheckDeps,
4661	ValueToValueMap &Strides) {
4662
4663	MaxSafeDepDistBytes = -1U;
4664	while (!CheckDeps.empty()) {
4665	MemAccessInfo CurAccess = *CheckDeps.begin();
4666
4667	// Get the relevant memory access set.
4668	EquivalenceClasses<MemAccessInfo>::iterator I =
4669	AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
4670
4671	// Check accesses within this set.
4672	EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
4673	AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
4674
4675	// Check every access pair.
4676	while (AI != AE) {
4677	CheckDeps.erase(*AI);
4678	EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
4679	while (OI != AE) {
4680	// Check every accessing instruction pair in program order.
4681	for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
4682	I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
4683	for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
4684	I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
4685	if (I1 < I2 && isDependent(AI, I1, OI, I2, Strides))
4686	return false;
4687	if (I2 < I1 && isDependent(OI, I2, AI, I1, Strides))
4688	return false;
4689	}
4690	++OI;
4691	}
4692	AI++;
4693	}
4694	}
4695	return true;
4696	}
4697
4698	bool LoopVectorizationLegality::canVectorizeMemory() {
4699
4700	typedef SmallVector<Value*, 16> ValueVector;
4701	typedef SmallPtrSet<Value*, 16> ValueSet;
4702
4703	// Holds the Load and Store instructions.
4704	ValueVector Loads;
4705	ValueVector Stores;
4706
4707	// Holds all the different accesses in the loop.
4708	unsigned NumReads = 0;
4709	unsigned NumReadWrites = 0;
4710
4711	PtrRtCheck.Pointers.clear();
4712	PtrRtCheck.Need = false;
4713
4714	const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
4715	MemoryDepChecker DepChecker(SE, DL, TheLoop);
4716
4717	// For each block.
4718	for (Loop::block_iterator bb = TheLoop->block_begin(),
4719	be = TheLoop->block_end(); bb != be; ++bb) {
4720
4721	// Scan the BB and collect legal loads and stores.
4722	for (BasicBlock::iterator it = (bb)->begin(), e = (bb)->end(); it != e;
4723	++it) {
4724
4725	// If this is a load, save it. If this instruction can read from memory
4726	// but is not a load, then we quit. Notice that we don't handle function
4727	// calls that read or write.
4728	if (it->mayReadFromMemory()) {
4729	// Many math library functions read the rounding mode. We will only
4730	// vectorize a loop if it contains known function calls that don't set
4731	// the flag. Therefore, it is safe to ignore this read from memory.
4732	CallInst *Call = dyn_cast<CallInst>(it);
4733	if (Call && getIntrinsicIDForCall(Call, TLI))
4734	continue;
4735
4736	LoadInst *Ld = dyn_cast<LoadInst>(it);
4737	if (!Ld \|\| (!Ld->isSimple() && !IsAnnotatedParallel)) {
4738	emitAnalysis(Report(Ld)
4739	<< "read with atomic ordering or volatile read");
4740	DEBUG(dbgs() << "LV: Found a non-simple load.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple load.\n" ; } } while (0);
4741	return false;
4742	}
4743	NumLoads++;
4744	Loads.push_back(Ld);
4745	DepChecker.addAccess(Ld);
4746	continue;
4747	}
4748
4749	// Save 'store' instructions. Abort if other instructions write to memory.
4750	if (it->mayWriteToMemory()) {
4751	StoreInst *St = dyn_cast<StoreInst>(it);
4752	if (!St) {
4753	emitAnalysis(Report(it) << "instruction cannot be vectorized");
4754	return false;
4755	}
4756	if (!St->isSimple() && !IsAnnotatedParallel) {
4757	emitAnalysis(Report(St)
4758	<< "write with atomic ordering or volatile write");
4759	DEBUG(dbgs() << "LV: Found a non-simple store.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple store.\n" ; } } while (0);
4760	return false;
4761	}
4762	NumStores++;
4763	Stores.push_back(St);
4764	DepChecker.addAccess(St);
4765	}
4766	} // Next instr.
4767	} // Next block.
4768
4769	// Now we have two lists that hold the loads and the stores.
4770	// Next, we find the pointers that they use.
4771
4772	// Check if we see any stores. If there are no stores, then we don't
4773	// care if the pointers are restrict.
4774	if (!Stores.size()) {
4775	DEBUG(dbgs() << "LV: Found a read-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a read-only loop!\n" ; } } while (0);
4776	return true;
4777	}
4778
4779	AccessAnalysis::DepCandidates DependentAccesses;
4780	AccessAnalysis Accesses(DL, AA, DependentAccesses);
4781
4782	// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
4783	// multiple times on the same object. If the ptr is accessed twice, once
4784	// for read and once for write, it will only appear once (on the write
4785	// list). This is okay, since we are going to check for conflicts between
4786	// writes and between reads and writes, but not between reads and reads.
4787	ValueSet Seen;
4788
4789	ValueVector::iterator I, IE;
4790	for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
4791	StoreInst ST = cast<StoreInst>(I);
4792	Value* Ptr = ST->getPointerOperand();
4793
4794	if (isUniform(Ptr)) {
4795	emitAnalysis(
4796	Report(ST)
4797	<< "write to a loop invariant address could not be vectorized");
4798	DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We don't allow storing to uniform addresses\n" ; } } while (0);
4799	return false;
4800	}
4801
4802	// If we did not see this pointer before, insert it to the read-write
4803	// list. At this phase it is only a 'write' list.
4804	if (Seen.insert(Ptr).second) {
4805	++NumReadWrites;
4806
4807	AliasAnalysis::Location Loc = AA->getLocation(ST);
4808	// The TBAA metadata could have a control dependency on the predication
4809	// condition, so we cannot rely on it when determining whether or not we
4810	// need runtime pointer checks.
4811	if (blockNeedsPredication(ST->getParent()))
4812	Loc.AATags.TBAA = nullptr;
4813
4814	Accesses.addStore(Loc);
4815	}
4816	}
4817
4818	if (IsAnnotatedParallel) {
4819	DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0)
4820	<< "LV: A loop annotated parallel, ignore memory dependency "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0)
4821	<< "checks.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0);
4822	return true;
4823	}
4824
4825	for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
4826	LoadInst LD = cast<LoadInst>(I);
4827	Value* Ptr = LD->getPointerOperand();
4828	// If we did not see this pointer before, insert it to the
4829	// read list. If we did see it before, then it is already in
4830	// the read-write list. This allows us to vectorize expressions
4831	// such as A[i] += x; Because the address of A[i] is a read-write
4832	// pointer. This only works if the index of A[i] is consecutive.
4833	// If the address of i is unknown (for example A[B[i]]) then we may
4834	// read a few words, modify, and write a few words, and some of the
4835	// words may be written to the same address.
4836	bool IsReadOnlyPtr = false;
4837	if (Seen.insert(Ptr).second \|\|
4838	!isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
4839	++NumReads;
4840	IsReadOnlyPtr = true;
4841	}
4842
4843	AliasAnalysis::Location Loc = AA->getLocation(LD);
4844	// The TBAA metadata could have a control dependency on the predication
4845	// condition, so we cannot rely on it when determining whether or not we
4846	// need runtime pointer checks.
4847	if (blockNeedsPredication(LD->getParent()))
4848	Loc.AATags.TBAA = nullptr;
4849
4850	Accesses.addLoad(Loc, IsReadOnlyPtr);
4851	}
4852
4853	// If we write (or read-write) to a single destination and there are no
4854	// other reads in this loop then is it safe to vectorize.
4855	if (NumReadWrites == 1 && NumReads == 0) {
4856	DEBUG(dbgs() << "LV: Found a write-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a write-only loop!\n" ; } } while (0);
4857	return true;
4858	}
4859
4860	// Build dependence sets and check whether we need a runtime pointer bounds
4861	// check.
4862	Accesses.buildDependenceSets();
4863	bool NeedRTCheck = Accesses.isRTCheckNeeded();
4864
4865	// Find pointers with computable bounds. We are going to use this information
4866	// to place a runtime bound check.
4867	unsigned NumComparisons = 0;
4868	bool CanDoRT = false;
4869	if (NeedRTCheck)
4870	CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
4871	Strides);
4872
4873	DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0)
4874	" pointer comparisons.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0);
4875
4876	// If we only have one set of dependences to check pointers among we don't
4877	// need a runtime check.
4878	if (NumComparisons == 0 && NeedRTCheck)
4879	NeedRTCheck = false;
4880
4881	// Check that we did not collect too many pointers or found an unsizeable
4882	// pointer.
4883	if (!CanDoRT \|\| NumComparisons > RuntimeMemoryCheckThreshold) {
4884	PtrRtCheck.reset();
4885	CanDoRT = false;
4886	}
4887
4888	if (CanDoRT) {
4889	DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can perform a memory runtime check if needed.\n" ; } } while (0);
4890	}
4891
4892	if (NeedRTCheck && !CanDoRT) {
4893	emitAnalysis(Report() << "cannot identify array bounds");
4894	DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0)
4895	"the array bounds.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0);
4896	PtrRtCheck.reset();
4897	return false;
4898	}
4899
4900	PtrRtCheck.Need = NeedRTCheck;
4901
4902	bool CanVecMem = true;
4903	if (Accesses.isDependencyCheckNeeded()) {
4904	DEBUG(dbgs() << "LV: Checking memory dependencies\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Checking memory dependencies\n" ; } } while (0);
4905	CanVecMem = DepChecker.areDepsSafe(
4906	DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
4907	MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
4908
4909	if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
4910	DEBUG(dbgs() << "LV: Retrying with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Retrying with memory checks\n" ; } } while (0);
4911	NeedRTCheck = true;
4912
4913	// Clear the dependency checks. We assume they are not needed.
4914	Accesses.resetDepChecks();
4915
4916	PtrRtCheck.reset();
4917	PtrRtCheck.Need = true;
4918
4919	CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
4920	TheLoop, Strides, true);
4921	// Check that we did not collect too many pointers or found an unsizeable
4922	// pointer.
4923	if (!CanDoRT \|\| NumComparisons > RuntimeMemoryCheckThreshold) {
4924	if (!CanDoRT && NumComparisons > 0)
4925	emitAnalysis(Report()
4926	<< "cannot check memory dependencies at runtime");
4927	else
4928	emitAnalysis(Report()
4929	<< NumComparisons << " exceeds limit of "
4930	<< RuntimeMemoryCheckThreshold
4931	<< " dependent memory operations checked at runtime");
4932	DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize with memory checks\n" ; } } while (0);
4933	PtrRtCheck.reset();
4934	return false;
4935	}
4936
4937	CanVecMem = true;
4938	}
4939	}
4940
4941	if (!CanVecMem)
4942	emitAnalysis(Report() << "unsafe dependent memory operations in loop");
4943
4944	DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0)
4945	" need a runtime memory check.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0);
4946
4947	return CanVecMem;
4948	}
4949
4950	static bool hasMultipleUsesOf(Instruction *I,
4951	SmallPtrSetImpl<Instruction *> &Insts) {
4952	unsigned NumUses = 0;
4953	for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
4954	if (Insts.count(dyn_cast<Instruction>(*Use)))
4955	++NumUses;
4956	if (NumUses > 1)
4957	return true;
4958	}
4959
4960	return false;
4961	}
4962
4963	static bool areAllUsesIn(Instruction I, SmallPtrSetImpl<Instruction > &Set) {
4964	for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
4965	if (!Set.count(dyn_cast<Instruction>(*Use)))
4966	return false;
4967	return true;
4968	}
4969
4970	bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
4971	ReductionKind Kind) {
4972	if (Phi->getNumIncomingValues() != 2)
4973	return false;
4974
4975	// Reduction variables are only found in the loop header block.
4976	if (Phi->getParent() != TheLoop->getHeader())
4977	return false;
4978
4979	// Obtain the reduction start value from the value that comes from the loop
4980	// preheader.
4981	Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
4982
4983	// ExitInstruction is the single value which is used outside the loop.
4984	// We only allow for a single reduction value to be used outside the loop.
4985	// This includes users of the reduction, variables (which form a cycle
4986	// which ends in the phi node).
4987	Instruction *ExitInstruction = nullptr;
4988	// Indicates that we found a reduction operation in our scan.
4989	bool FoundReduxOp = false;
4990
4991	// We start with the PHI node and scan for all of the users of this
4992	// instruction. All users must be instructions that can be used as reduction
4993	// variables (such as ADD). We must have a single out-of-block user. The cycle
4994	// must include the original PHI.
4995	bool FoundStartPHI = false;
4996
4997	// To recognize min/max patterns formed by a icmp select sequence, we store
4998	// the number of instruction we saw from the recognized min/max pattern,
4999	// to make sure we only see exactly the two instructions.
5000	unsigned NumCmpSelectPatternInst = 0;
5001	ReductionInstDesc ReduxDesc(false, nullptr);
5002
5003	SmallPtrSet<Instruction *, 8> VisitedInsts;
5004	SmallVector<Instruction *, 8> Worklist;
5005	Worklist.push_back(Phi);
5006	VisitedInsts.insert(Phi);
5007
5008	// A value in the reduction can be used:
5009	// - By the reduction:
5010	// - Reduction operation:
5011	// - One use of reduction value (safe).
5012	// - Multiple use of reduction value (not safe).
5013	// - PHI:
5014	// - All uses of the PHI must be the reduction (safe).
5015	// - Otherwise, not safe.
5016	// - By one instruction outside of the loop (safe).
5017	// - By further instructions outside of the loop (not safe).
5018	// - By an instruction that is not part of the reduction (not safe).
5019	// This is either:
5020	// * An instruction type other than PHI or the reduction operation.
5021	// * A PHI in the header other than the initial PHI.
5022	while (!Worklist.empty()) {
5023	Instruction *Cur = Worklist.back();
5024	Worklist.pop_back();
5025
5026	// No Users.
5027	// If the instruction has no users then this is a broken chain and can't be
5028	// a reduction variable.
5029	if (Cur->use_empty())
5030	return false;
5031
5032	bool IsAPhi = isa<PHINode>(Cur);
5033
5034	// A header PHI use other than the original PHI.
5035	if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
5036	return false;
5037
5038	// Reductions of instructions such as Div, and Sub is only possible if the
5039	// LHS is the reduction variable.
5040	if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
5041	!isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
5042	!VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
5043	return false;
5044
5045	// Any reduction instruction must be of one of the allowed kinds.
5046	ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
5047	if (!ReduxDesc.IsReduction)
5048	return false;
5049
5050	// A reduction operation must only have one use of the reduction value.
5051	if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
5052	hasMultipleUsesOf(Cur, VisitedInsts))
5053	return false;
5054
5055	// All inputs to a PHI node must be a reduction value.
5056	if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
5057	return false;
5058
5059	if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) \|\|
5060	isa<SelectInst>(Cur)))
5061	++NumCmpSelectPatternInst;
5062	if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) \|\|
5063	isa<SelectInst>(Cur)))
5064	++NumCmpSelectPatternInst;
5065
5066	// Check whether we found a reduction operator.
5067	FoundReduxOp \|= !IsAPhi;
5068
5069	// Process users of current instruction. Push non-PHI nodes after PHI nodes
5070	// onto the stack. This way we are going to have seen all inputs to PHI
5071	// nodes once we get to them.
5072	SmallVector<Instruction *, 8> NonPHIs;
5073	SmallVector<Instruction *, 8> PHIs;
5074	for (User *U : Cur->users()) {
5075	Instruction *UI = cast<Instruction>(U);
5076
5077	// Check if we found the exit user.
5078	BasicBlock *Parent = UI->getParent();
5079	if (!TheLoop->contains(Parent)) {
5080	// Exit if you find multiple outside users or if the header phi node is
5081	// being used. In this case the user uses the value of the previous
5082	// iteration, in which case we would loose "VF-1" iterations of the
5083	// reduction operation if we vectorize.
5084	if (ExitInstruction != nullptr \|\| Cur == Phi)
5085	return false;
5086
5087	// The instruction used by an outside user must be the last instruction
5088	// before we feed back to the reduction phi. Otherwise, we loose VF-1
5089	// operations on the value.
5090	if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
5091	return false;
5092
5093	ExitInstruction = Cur;
5094	continue;
5095	}
5096
5097	// Process instructions only once (termination). Each reduction cycle
5098	// value must only be used once, except by phi nodes and min/max
5099	// reductions which are represented as a cmp followed by a select.
5100	ReductionInstDesc IgnoredVal(false, nullptr);
5101	if (VisitedInsts.insert(UI).second) {
5102	if (isa<PHINode>(UI))
5103	PHIs.push_back(UI);
5104	else
5105	NonPHIs.push_back(UI);
5106	} else if (!isa<PHINode>(UI) &&
5107	((!isa<FCmpInst>(UI) &&
5108	!isa<ICmpInst>(UI) &&
5109	!isa<SelectInst>(UI)) \|\|
5110	!isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
5111	return false;
5112
5113	// Remember that we completed the cycle.
5114	if (UI == Phi)
5115	FoundStartPHI = true;
5116	}
5117	Worklist.append(PHIs.begin(), PHIs.end());
5118	Worklist.append(NonPHIs.begin(), NonPHIs.end());
5119	}
5120
5121	// This means we have seen one but not the other instruction of the
5122	// pattern or more than just a select and cmp.
5123	if ((Kind == RK_IntegerMinMax \|\| Kind == RK_FloatMinMax) &&
5124	NumCmpSelectPatternInst != 2)
5125	return false;
5126
5127	if (!FoundStartPHI \|\| !FoundReduxOp \|\| !ExitInstruction)
5128	return false;
5129
5130	// We found a reduction var if we have reached the original phi node and we
5131	// only have a single instruction with out-of-loop users.
5132
5133	// This instruction is allowed to have out-of-loop users.
5134	AllowedExit.insert(ExitInstruction);
5135
5136	// Save the description of this reduction variable.
5137	ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
5138	ReduxDesc.MinMaxKind);
5139	Reductions[Phi] = RD;
5140	// We've ended the cycle. This is a reduction variable if we have an
5141	// outside user and it has a binary op.
5142
5143	return true;
5144	}
5145
5146	/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
5147	/// pattern corresponding to a min(X, Y) or max(X, Y).
5148	LoopVectorizationLegality::ReductionInstDesc
5149	LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
5150	ReductionInstDesc &Prev) {
5151
5152	assert((isa<ICmpInst>(I) \|\| isa<FCmpInst>(I) \|\| isa<SelectInst>(I)) &&(((isa<ICmpInst>(I) \|\| isa<FCmpInst>(I) \|\| isa< SelectInst>(I)) && "Expect a select instruction") ? static_cast<void> (0) : __assert_fail ("(isa<ICmpInst>(I) \|\| isa<FCmpInst>(I) \|\| isa<SelectInst>(I)) && \"Expect a select instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5153, __PRETTY_FUNCTION__))
5153	"Expect a select instruction")(((isa<ICmpInst>(I) \|\| isa<FCmpInst>(I) \|\| isa< SelectInst>(I)) && "Expect a select instruction") ? static_cast<void> (0) : __assert_fail ("(isa<ICmpInst>(I) \|\| isa<FCmpInst>(I) \|\| isa<SelectInst>(I)) && \"Expect a select instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5153, __PRETTY_FUNCTION__));
5154	Instruction *Cmp = nullptr;
5155	SelectInst *Select = nullptr;
5156
5157	// We must handle the select(cmp()) as a single instruction. Advance to the
5158	// select.
5159	if ((Cmp = dyn_cast<ICmpInst>(I)) \|\| (Cmp = dyn_cast<FCmpInst>(I))) {
5160	if (!Cmp->hasOneUse() \|\| !(Select = dyn_cast<SelectInst>(*I->user_begin())))
5161	return ReductionInstDesc(false, I);
5162	return ReductionInstDesc(Select, Prev.MinMaxKind);
5163	}
5164
5165	// Only handle single use cases for now.
5166	if (!(Select = dyn_cast<SelectInst>(I)))
5167	return ReductionInstDesc(false, I);
5168	if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
5169	!(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
5170	return ReductionInstDesc(false, I);
5171	if (!Cmp->hasOneUse())
5172	return ReductionInstDesc(false, I);
5173
5174	Value *CmpLeft;
5175	Value *CmpRight;
5176
5177	// Look for a min/max pattern.
5178	if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5179	return ReductionInstDesc(Select, MRK_UIntMin);
5180	else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5181	return ReductionInstDesc(Select, MRK_UIntMax);
5182	else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5183	return ReductionInstDesc(Select, MRK_SIntMax);
5184	else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5185	return ReductionInstDesc(Select, MRK_SIntMin);
5186	else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5187	return ReductionInstDesc(Select, MRK_FloatMin);
5188	else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5189	return ReductionInstDesc(Select, MRK_FloatMax);
5190	else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5191	return ReductionInstDesc(Select, MRK_FloatMin);
5192	else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
5193	return ReductionInstDesc(Select, MRK_FloatMax);
5194
5195	return ReductionInstDesc(false, I);
5196	}
5197
5198	LoopVectorizationLegality::ReductionInstDesc
5199	LoopVectorizationLegality::isReductionInstr(Instruction *I,
5200	ReductionKind Kind,
5201	ReductionInstDesc &Prev) {
5202	bool FP = I->getType()->isFloatingPointTy();
5203	bool FastMath = FP && I->hasUnsafeAlgebra();
5204	switch (I->getOpcode()) {
5205	default:
5206	return ReductionInstDesc(false, I);
5207	case Instruction::PHI:
5208	if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
5209	Kind != RK_FloatMinMax))
5210	return ReductionInstDesc(false, I);
5211	return ReductionInstDesc(I, Prev.MinMaxKind);
5212	case Instruction::Sub:
5213	case Instruction::Add:
5214	return ReductionInstDesc(Kind == RK_IntegerAdd, I);
5215	case Instruction::Mul:
5216	return ReductionInstDesc(Kind == RK_IntegerMult, I);
5217	case Instruction::And:
5218	return ReductionInstDesc(Kind == RK_IntegerAnd, I);
5219	case Instruction::Or:
5220	return ReductionInstDesc(Kind == RK_IntegerOr, I);
5221	case Instruction::Xor:
5222	return ReductionInstDesc(Kind == RK_IntegerXor, I);
5223	case Instruction::FMul:
5224	return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
5225	case Instruction::FSub:
5226	case Instruction::FAdd:
5227	return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
5228	case Instruction::FCmp:
5229	case Instruction::ICmp:
5230	case Instruction::Select:
5231	if (Kind != RK_IntegerMinMax &&
5232	(!HasFunNoNaNAttr \|\| Kind != RK_FloatMinMax))
5233	return ReductionInstDesc(false, I);
5234	return isMinMaxSelectCmpPattern(I, Prev);
5235	}
5236	}
5237
5238	LoopVectorizationLegality::InductionKind
5239	LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
5240	Type *PhiTy = Phi->getType();
5241	// We only handle integer and pointer inductions variables.
5242	if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
5243	return IK_NoInduction;
5244
5245	// Check that the PHI is consecutive.
5246	const SCEV *PhiScev = SE->getSCEV(Phi);
5247	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
5248	if (!AR) {
5249	DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: PHI is not a poly recurrence.\n" ; } } while (0);
5250	return IK_NoInduction;
5251	}
5252	const SCEV Step = AR->getStepRecurrence(SE);
5253
5254	// Integer inductions need to have a stride of one.
5255	if (PhiTy->isIntegerTy()) {
5256	if (Step->isOne())
5257	return IK_IntInduction;
5258	if (Step->isAllOnesValue())
5259	return IK_ReverseIntInduction;
5260	return IK_NoInduction;
5261	}
5262
5263	// Calculate the pointer stride and check if it is consecutive.
5264	const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
5265	if (!C)
5266	return IK_NoInduction;
5267
5268	assert(PhiTy->isPointerTy() && "The PHI must be a pointer")((PhiTy->isPointerTy() && "The PHI must be a pointer" ) ? static_cast<void> (0) : __assert_fail ("PhiTy->isPointerTy() && \"The PHI must be a pointer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5268, __PRETTY_FUNCTION__));
5269	Type *PointerElementType = PhiTy->getPointerElementType();
5270	// The pointer stride cannot be determined if the pointer element type is not
5271	// sized.
5272	if (!PointerElementType->isSized())
5273	return IK_NoInduction;
5274
5275	uint64_t Size = DL->getTypeAllocSize(PointerElementType);
5276	if (C->getValue()->equalsInt(Size))
5277	return IK_PtrInduction;
5278	else if (C->getValue()->equalsInt(0 - Size))
5279	return IK_ReversePtrInduction;
5280
5281	return IK_NoInduction;
5282	}
5283
5284	bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
5285	Value In0 = const_cast<Value>(V);
5286	PHINode *PN = dyn_cast_or_null<PHINode>(In0);
5287	if (!PN)
5288	return false;
5289
5290	return Inductions.count(PN);
5291	}
5292
5293	bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
5294	assert(TheLoop->contains(BB) && "Unknown block used")((TheLoop->contains(BB) && "Unknown block used") ? static_cast<void> (0) : __assert_fail ("TheLoop->contains(BB) && \"Unknown block used\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5294, __PRETTY_FUNCTION__));
5295
5296	// Blocks that do not dominate the latch need predication.
5297	BasicBlock* Latch = TheLoop->getLoopLatch();
5298	return !DT->dominates(BB, Latch);
5299	}
5300
5301	bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
5302	SmallPtrSetImpl<Value *> &SafePtrs) {
5303	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
5304	// We might be able to hoist the load.
5305	if (it->mayReadFromMemory()) {
5306	LoadInst *LI = dyn_cast<LoadInst>(it);
5307	if (!LI \|\| !SafePtrs.count(LI->getPointerOperand()))
5308	return false;
5309	}
5310
5311	// We don't predicate stores at the moment.
5312	if (it->mayWriteToMemory()) {
5313	StoreInst *SI = dyn_cast<StoreInst>(it);
5314	// We only support predication of stores in basic blocks with one
5315	// predecessor.
5316	if (!SI \|\| ++NumPredStores > NumberOfStoresToPredicate \|\|
5317	!SafePtrs.count(SI->getPointerOperand()) \|\|
5318	!SI->getParent()->getSinglePredecessor())
5319	return false;
5320	}
5321	if (it->mayThrow())
5322	return false;
5323
5324	// Check that we don't have a constant expression that can trap as operand.
5325	for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
5326	OI != OE; ++OI) {
5327	if (Constant C = dyn_cast<Constant>(OI))
5328	if (C->canTrap())
5329	return false;
5330	}
5331
5332	// The instructions below can trap.
5333	switch (it->getOpcode()) {
5334	default: continue;
5335	case Instruction::UDiv:
5336	case Instruction::SDiv:
5337	case Instruction::URem:
5338	case Instruction::SRem:
5339	return false;
5340	}
5341	}
5342
5343	return true;
5344	}
5345
5346	LoopVectorizationCostModel::VectorizationFactor
5347	LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
5348	// Width 1 means no vectorize
5349	VectorizationFactor Factor = { 1U, 0U };
5350	if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
5351	emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
5352	DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n" ; } } while (0);
5353	return Factor;
5354	}
5355
5356	if (!EnableCondStoresVectorization && Legal->NumPredStores) {
5357	emitAnalysis(Report() << "store that is conditionally executed prevents vectorization");
5358	DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n" ; } } while (0);
5359	return Factor;
5360	}
5361
5362	// Find the trip count.
5363	unsigned TC = SE->getSmallConstantTripCount(TheLoop);
5364	DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found trip count: " << TC << '\n'; } } while (0);
5365
5366	unsigned WidestType = getWidestType();
5367	unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5368	unsigned MaxSafeDepDist = -1U;
5369	if (Legal->getMaxSafeDepDistBytes() != -1U)
5370	MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5371	WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
5372	WidestRegister : MaxSafeDepDist);
5373	unsigned MaxVectorSize = WidestRegister / WidestType;
5374	DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"; } } while (0);
5375	DEBUG(dbgs() << "LV: The Widest register is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0)
5376	<< WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0);
5377
5378	if (MaxVectorSize == 0) {
5379	DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n" ; } } while (0);
5380	MaxVectorSize = 1;
5381	}
5382
5383	assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"((MaxVectorSize <= 32 && "Did not expect to pack so many elements" " into one vector!") ? static_cast<void> (0) : __assert_fail ("MaxVectorSize <= 32 && \"Did not expect to pack so many elements\" \" into one vector!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5384, __PRETTY_FUNCTION__))
5384	" into one vector!")((MaxVectorSize <= 32 && "Did not expect to pack so many elements" " into one vector!") ? static_cast<void> (0) : __assert_fail ("MaxVectorSize <= 32 && \"Did not expect to pack so many elements\" \" into one vector!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5384, __PRETTY_FUNCTION__));
5385
5386	unsigned VF = MaxVectorSize;
5387
5388	// If we optimize the program for size, avoid creating the tail loop.
5389	if (OptForSize) {
5390	// If we are unable to calculate the trip count then don't try to vectorize.
5391	if (TC < 2) {
5392	emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow");
5393	DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0);
5394	return Factor;
5395	}
5396
5397	// Find the maximum SIMD width that can fit within the trip count.
5398	VF = TC % MaxVectorSize;
5399
5400	if (VF == 0)
5401	VF = MaxVectorSize;
5402
5403	// If the trip count that we found modulo the vectorization factor is not
5404	// zero then we require a tail.
5405	if (VF < 2) {
5406	emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
5407	"same time. Enable vectorization of this loop "
5408	"with '#pragma clang loop vectorize(enable)' "
5409	"when compiling with -Os");
5410	DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0);
5411	return Factor;
5412	}
5413	}
5414
5415	int UserVF = Hints->getWidth();
5416	if (UserVF != 0) {
5417	assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")((isPowerOf2_32(UserVF) && "VF needs to be a power of two" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5417, __PRETTY_FUNCTION__));
5418	DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Using user VF " << UserVF << ".\n"; } } while (0);
5419
5420	Factor.Width = UserVF;
5421	return Factor;
5422	}
5423
5424	float Cost = expectedCost(1);
5425	#ifndef NDEBUG
5426	const float ScalarCost = Cost;
5427	#endif /* NDEBUG */
5428	unsigned Width = 1;
5429	DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"; } } while (0);
5430
5431	bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5432	// Ignore scalar width, because the user explicitly wants vectorization.
5433	if (ForceVectorization && VF > 1) {
5434	Width = 2;
5435	Cost = expectedCost(Width) / (float)Width;
5436	}
5437
5438	for (unsigned i=2; i <= VF; i*=2) {
5439	// Notice that the vector loop needs to be executed less times, so
5440	// we need to divide the cost of the vector loops by the width of
5441	// the vector elements.
5442	float VectorCost = expectedCost(i) / (float)i;
5443	DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"; } } while (0)
5444	(int)VectorCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"; } } while (0);
5445	if (VectorCost < Cost) {
5446	Cost = VectorCost;
5447	Width = i;
5448	}
5449	}
5450
5451	DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0)
5452	<< "LV: Vectorization seems to be not beneficial, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0)
5453	<< "but was forced by a user.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0);
5454	DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Selecting VF: "<< Width << ".\n"; } } while (0);
5455	Factor.Width = Width;
5456	Factor.Cost = Width * Cost;
5457	return Factor;
5458	}
5459
5460	unsigned LoopVectorizationCostModel::getWidestType() {
5461	unsigned MaxWidth = 8;
5462
5463	// For each block.
5464	for (Loop::block_iterator bb = TheLoop->block_begin(),
5465	be = TheLoop->block_end(); bb != be; ++bb) {
5466	BasicBlock BB = bb;
5467
5468	// For each instruction in the loop.
5469	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
5470	Type *T = it->getType();
5471
5472	// Ignore ephemeral values.
5473	if (EphValues.count(it))
5474	continue;
5475
5476	// Only examine Loads, Stores and PHINodes.
5477	if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
5478	continue;
5479
5480	// Examine PHI nodes that are reduction variables.
5481	if (PHINode *PN = dyn_cast<PHINode>(it))
5482	if (!Legal->getReductionVars()->count(PN))
5483	continue;
5484
5485	// Examine the stored values.
5486	if (StoreInst *ST = dyn_cast<StoreInst>(it))
5487	T = ST->getValueOperand()->getType();
5488
5489	// Ignore loaded pointer types and stored pointer types that are not
5490	// consecutive. However, we do want to take consecutive stores/loads of
5491	// pointer vectors into account.
5492	if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
5493	continue;
5494
5495	MaxWidth = std::max(MaxWidth,
5496	(unsigned)DL->getTypeSizeInBits(T->getScalarType()));
5497	}
5498	}
5499
5500	return MaxWidth;
5501	}
5502
5503	unsigned
5504	LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
5505	unsigned VF,
5506	unsigned LoopCost) {
5507
5508	// -- The unroll heuristics --
5509	// We unroll the loop in order to expose ILP and reduce the loop overhead.
5510	// There are many micro-architectural considerations that we can't predict
5511	// at this level. For example, frontend pressure (on decode or fetch) due to
5512	// code size, or the number and capabilities of the execution ports.
5513	//
5514	// We use the following heuristics to select the unroll factor:
5515	// 1. If the code has reductions, then we unroll in order to break the cross
5516	// iteration dependency.
5517	// 2. If the loop is really small, then we unroll in order to reduce the loop
5518	// overhead.
5519	// 3. We don't unroll if we think that we will spill registers to memory due
5520	// to the increased register pressure.
5521
5522	// Use the user preference, unless 'auto' is selected.
5523	int UserUF = Hints->getInterleave();
5524	if (UserUF != 0)
5525	return UserUF;
5526
5527	// When we optimize for size, we don't unroll.
5528	if (OptForSize)
5529	return 1;
5530
5531	// We used the distance for the unroll factor.
5532	if (Legal->getMaxSafeDepDistBytes() != -1U)
5533	return 1;
5534
5535	// Do not unroll loops with a relatively small trip count.
5536	unsigned TC = SE->getSmallConstantTripCount(TheLoop);
5537	if (TC > 1 && TC < TinyTripCountUnrollThreshold)
5538	return 1;
5539
5540	unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5541	DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has " << TargetNumRegisters << " registers\n"; } } while (0)
5542	" registers\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has " << TargetNumRegisters << " registers\n"; } } while (0);
5543
5544	if (VF == 1) {
5545	if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5546	TargetNumRegisters = ForceTargetNumScalarRegs;
5547	} else {
5548	if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5549	TargetNumRegisters = ForceTargetNumVectorRegs;
5550	}
5551
5552	LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
5553	// We divide by these constants so assume that we have at least one
5554	// instruction that uses at least one register.
5555	R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5556	R.NumInstructions = std::max(R.NumInstructions, 1U);
5557
5558	// We calculate the unroll factor using the following formula.
5559	// Subtract the number of loop invariants from the number of available
5560	// registers. These registers are used by all of the unrolled instances.
5561	// Next, divide the remaining registers by the number of registers that is
5562	// required by the loop, in order to estimate how many parallel instances
5563	// fit without causing spills. All of this is rounded down if necessary to be
5564	// a power of two. We want power of two unroll factors to simplify any
5565	// addressing operations or alignment considerations.
5566	unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5567	R.MaxLocalUsers);
5568
5569	// Don't count the induction variable as unrolled.
5570	if (EnableIndVarRegisterHeur)
5571	UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5572	std::max(1U, (R.MaxLocalUsers - 1)));
5573
5574	// Clamp the unroll factor ranges to reasonable factors.
5575	unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
5576
5577	// Check if the user has overridden the unroll max.
5578	if (VF == 1) {
5579	if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5580	MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
5581	} else {
5582	if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5583	MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
5584	}
5585
5586	// If we did not calculate the cost for VF (because the user selected the VF)
5587	// then we calculate the cost of VF here.
5588	if (LoopCost == 0)
5589	LoopCost = expectedCost(VF);
5590
5591	// Clamp the calculated UF to be between the 1 and the max unroll factor
5592	// that the target allows.
5593	if (UF > MaxInterleaveSize)
5594	UF = MaxInterleaveSize;
5595	else if (UF < 1)
5596	UF = 1;
5597
5598	// Unroll if we vectorized this loop and there is a reduction that could
5599	// benefit from unrolling.
5600	if (VF > 1 && Legal->getReductionVars()->size()) {
5601	DEBUG(dbgs() << "LV: Unrolling because of reductions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling because of reductions.\n" ; } } while (0);
5602	return UF;
5603	}
5604
5605	// Note that if we've already vectorized the loop we will have done the
5606	// runtime check and so unrolling won't require further checks.
5607	bool UnrollingRequiresRuntimePointerCheck =
5608	(VF == 1 && Legal->getRuntimePointerCheck()->Need);
5609
5610	// We want to unroll small loops in order to reduce the loop overhead and
5611	// potentially expose ILP opportunities.
5612	DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop cost is " << LoopCost << '\n'; } } while (0);
5613	if (!UnrollingRequiresRuntimePointerCheck &&
5614	LoopCost < SmallLoopCost) {
5615	// We assume that the cost overhead is 1 and we use the cost model
5616	// to estimate the cost of the loop and unroll until the cost of the
5617	// loop overhead is about 5% of the cost of the loop.
5618	unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5619
5620	// Unroll until store/load ports (estimated by max unroll factor) are
5621	// saturated.
5622	unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
5623	unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
5624
5625	// If we have a scalar reduction (vector reductions are already dealt with
5626	// by this point), we can increase the critical path length if the loop
5627	// we're unrolling is inside another loop. Limit, by default to 2, so the
5628	// critical path only gets increased by one reduction operation.
5629	if (Legal->getReductionVars()->size() &&
5630	TheLoop->getLoopDepth() > 1) {
5631	unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
5632	SmallUF = std::min(SmallUF, F);
5633	StoresUF = std::min(StoresUF, F);
5634	LoadsUF = std::min(LoadsUF, F);
5635	}
5636
5637	if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
5638	DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling to saturate store or load ports.\n" ; } } while (0);
5639	return std::max(StoresUF, LoadsUF);
5640	}
5641
5642	DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling to reduce branch cost.\n" ; } } while (0);
5643	return SmallUF;
5644	}
5645
5646	DEBUG(dbgs() << "LV: Not Unrolling.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not Unrolling.\n"; } } while (0);
5647	return 1;
5648	}
5649
5650	LoopVectorizationCostModel::RegisterUsage
5651	LoopVectorizationCostModel::calculateRegisterUsage() {
5652	// This function calculates the register usage by measuring the highest number
5653	// of values that are alive at a single location. Obviously, this is a very
5654	// rough estimation. We scan the loop in a topological order in order and
5655	// assign a number to each instruction. We use RPO to ensure that defs are
5656	// met before their users. We assume that each instruction that has in-loop
5657	// users starts an interval. We record every time that an in-loop value is
5658	// used, so we have a list of the first and last occurrences of each
5659	// instruction. Next, we transpose this data structure into a multi map that
5660	// holds the list of intervals that end at a specific location. This multi
5661	// map allows us to perform a linear search. We scan the instructions linearly
5662	// and record each time that a new interval starts, by placing it in a set.
5663	// If we find this value in the multi-map then we remove it from the set.
5664	// The max register usage is the maximum size of the set.
5665	// We also search for instructions that are defined outside the loop, but are
5666	// used inside the loop. We need this number separately from the max-interval
5667	// usage number because when we unroll, loop-invariant values do not take
5668	// more register.
5669	LoopBlocksDFS DFS(TheLoop);
5670	DFS.perform(LI);
5671
5672	RegisterUsage R;
5673	R.NumInstructions = 0;
5674
5675	// Each 'key' in the map opens a new interval. The values
5676	// of the map are the index of the 'last seen' usage of the
5677	// instruction that is the key.
5678	typedef DenseMap<Instruction*, unsigned> IntervalMap;
5679	// Maps instruction to its index.
5680	DenseMap<unsigned, Instruction*> IdxToInstr;
5681	// Marks the end of each interval.
5682	IntervalMap EndPoint;
5683	// Saves the list of instruction indices that are used in the loop.
5684	SmallSet<Instruction*, 8> Ends;
5685	// Saves the list of values that are used in the loop but are
5686	// defined outside the loop, such as arguments and constants.
5687	SmallPtrSet<Value*, 8> LoopInvariants;
5688
5689	unsigned Index = 0;
5690	for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
5691	be = DFS.endRPO(); bb != be; ++bb) {
5692	R.NumInstructions += (*bb)->size();
5693	for (BasicBlock::iterator it = (bb)->begin(), e = (bb)->end(); it != e;
5694	++it) {
5695	Instruction *I = it;
5696	IdxToInstr[Index++] = I;
5697
5698	// Save the end location of each USE.
5699	for (unsigned i = 0; i < I->getNumOperands(); ++i) {
5700	Value *U = I->getOperand(i);
5701	Instruction *Instr = dyn_cast<Instruction>(U);
5702
5703	// Ignore non-instruction values such as arguments, constants, etc.
5704	if (!Instr) continue;
5705
5706	// If this instruction is outside the loop then record it and continue.
5707	if (!TheLoop->contains(Instr)) {
5708	LoopInvariants.insert(Instr);
5709	continue;
5710	}
5711
5712	// Overwrite previous end points.
5713	EndPoint[Instr] = Index;
5714	Ends.insert(Instr);
5715	}
5716	}
5717	}
5718
5719	// Saves the list of intervals that end with the index in 'key'.
5720	typedef SmallVector<Instruction*, 2> InstrList;
5721	DenseMap<unsigned, InstrList> TransposeEnds;
5722
5723	// Transpose the EndPoints to a list of values that end at each index.
5724	for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
5725	it != e; ++it)
5726	TransposeEnds[it->second].push_back(it->first);
5727
5728	SmallSet<Instruction*, 8> OpenIntervals;
5729	unsigned MaxUsage = 0;
5730
5731
5732	DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Calculating max register usage:\n" ; } } while (0);
5733	for (unsigned int i = 0; i < Index; ++i) {
5734	Instruction *I = IdxToInstr[i];
5735	// Ignore instructions that are never used within the loop.
5736	if (!Ends.count(I)) continue;
5737
5738	// Ignore ephemeral values.
5739	if (EphValues.count(I))
5740	continue;
5741
5742	// Remove all of the instructions that end at this location.
5743	InstrList &List = TransposeEnds[i];
5744	for (unsigned int j=0, e = List.size(); j < e; ++j)
5745	OpenIntervals.erase(List[j]);
5746
5747	// Count the number of live interals.
5748	MaxUsage = std::max(MaxUsage, OpenIntervals.size());
5749
5750	DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'; } } while (0)
5751	OpenIntervals.size() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'; } } while (0);
5752
5753	// Add the current instruction to the list of open intervals.
5754	OpenIntervals.insert(I);
5755	}
5756
5757	unsigned Invariant = LoopInvariants.size();
5758	DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'; } } while (0);
5759	DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'; } } while (0);
5760	DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'; } } while (0);
5761
5762	R.LoopInvariantRegs = Invariant;
5763	R.MaxLocalUsers = MaxUsage;
5764	return R;
5765	}
5766
5767	unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
5768	unsigned Cost = 0;
5769
5770	// For each block.
5771	for (Loop::block_iterator bb = TheLoop->block_begin(),
5772	be = TheLoop->block_end(); bb != be; ++bb) {
5773	unsigned BlockCost = 0;
5774	BasicBlock BB = bb;
5775
5776	// For each instruction in the old loop.
5777	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
5778	// Skip dbg intrinsics.
5779	if (isa<DbgInfoIntrinsic>(it))
5780	continue;
5781
5782	// Ignore ephemeral values.
5783	if (EphValues.count(it))
5784	continue;
5785
5786	unsigned C = getInstructionCost(it, VF);
5787
5788	// Check if we should override the cost.
5789	if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5790	C = ForceTargetInstructionCost;
5791
5792	BlockCost += C;
5793	DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << *it << '\n'; } } while (0)
5794	VF << " For instruction: " << it << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << it << '\n'; } } while (0);
5795	}
5796
5797	// We assume that if-converted blocks have a 50% chance of being executed.
5798	// When the code is scalar then some of the blocks are avoided due to CF.
5799	// When the code is vectorized we execute all code paths.
5800	if (VF == 1 && Legal->blockNeedsPredication(*bb))
5801	BlockCost /= 2;
5802
5803	Cost += BlockCost;
5804	}
5805
5806	return Cost;
5807	}
5808
5809	/// \brief Check whether the address computation for a non-consecutive memory
5810	/// access looks like an unlikely candidate for being merged into the indexing
5811	/// mode.
5812	///
5813	/// We look for a GEP which has one index that is an induction variable and all
5814	/// other indices are loop invariant. If the stride of this access is also
5815	/// within a small bound we decide that this address computation can likely be
5816	/// merged into the addressing mode.
5817	/// In all other cases, we identify the address computation as complex.
5818	static bool isLikelyComplexAddressComputation(Value *Ptr,
5819	LoopVectorizationLegality *Legal,
5820	ScalarEvolution *SE,
5821	const Loop *TheLoop) {
5822	GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5823	if (!Gep)
5824	return true;
5825
5826	// We are looking for a gep with all loop invariant indices except for one
5827	// which should be an induction variable.
5828	unsigned NumOperands = Gep->getNumOperands();
5829	for (unsigned i = 1; i < NumOperands; ++i) {
5830	Value *Opd = Gep->getOperand(i);
5831	if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5832	!Legal->isInductionVariable(Opd))
5833	return true;
5834	}
5835
5836	// Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
5837	// can likely be merged into the address computation.
5838	unsigned MaxMergeDistance = 64;
5839
5840	const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
5841	if (!AddRec)
5842	return true;
5843
5844	// Check the step is constant.
5845	const SCEV Step = AddRec->getStepRecurrence(SE);
5846	// Calculate the pointer stride and check if it is consecutive.
5847	const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
5848	if (!C)
5849	return true;
5850
5851	const APInt &APStepVal = C->getValue()->getValue();
5852
5853	// Huge step value - give up.
5854	if (APStepVal.getBitWidth() > 64)
5855	return true;
5856
5857	int64_t StepVal = APStepVal.getSExtValue();
5858
5859	return StepVal > MaxMergeDistance;
5860	}
5861
5862	static bool isStrideMul(Instruction I, LoopVectorizationLegality Legal) {
5863	if (Legal->hasStride(I->getOperand(0)) \|\| Legal->hasStride(I->getOperand(1)))
5864	return true;
5865	return false;
5866	}
5867
5868	unsigned
5869	LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5870	// If we know that this instruction will remain uniform, check the cost of
5871	// the scalar version.
5872	if (Legal->isUniformAfterVectorization(I))
5873	VF = 1;
5874
5875	Type *RetTy = I->getType();
5876	Type *VectorTy = ToVectorTy(RetTy, VF);
5877
5878	// TODO: We need to estimate the cost of intrinsic calls.
5879	switch (I->getOpcode()) {
5880	case Instruction::GetElementPtr:
5881	// We mark this instruction as zero-cost because the cost of GEPs in
5882	// vectorized code depends on whether the corresponding memory instruction
5883	// is scalarized or not. Therefore, we handle GEPs with the memory
5884	// instruction cost.
5885	return 0;
5886	case Instruction::Br: {
5887	return TTI.getCFInstrCost(I->getOpcode());
5888	}
5889	case Instruction::PHI:
5890	//TODO: IF-converted IFs become selects.
5891	return 0;
5892	case Instruction::Add:
5893	case Instruction::FAdd:
5894	case Instruction::Sub:
5895	case Instruction::FSub:
5896	case Instruction::Mul:
5897	case Instruction::FMul:
5898	case Instruction::UDiv:
5899	case Instruction::SDiv:
5900	case Instruction::FDiv:
5901	case Instruction::URem:
5902	case Instruction::SRem:
5903	case Instruction::FRem:
5904	case Instruction::Shl:
5905	case Instruction::LShr:
5906	case Instruction::AShr:
5907	case Instruction::And:
5908	case Instruction::Or:
5909	case Instruction::Xor: {
5910	// Since we will replace the stride by 1 the multiplication should go away.
5911	if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5912	return 0;
5913	// Certain instructions can be cheaper to vectorize if they have a constant
5914	// second vector operand. One example of this are shifts on x86.
5915	TargetTransformInfo::OperandValueKind Op1VK =
5916	TargetTransformInfo::OK_AnyValue;
5917	TargetTransformInfo::OperandValueKind Op2VK =
5918	TargetTransformInfo::OK_AnyValue;
5919	TargetTransformInfo::OperandValueProperties Op1VP =
5920	TargetTransformInfo::OP_None;
5921	TargetTransformInfo::OperandValueProperties Op2VP =
5922	TargetTransformInfo::OP_None;
5923	Value *Op2 = I->getOperand(1);
5924
5925	// Check for a splat of a constant or for a non uniform vector of constants.
5926	if (isa<ConstantInt>(Op2)) {
5927	ConstantInt *CInt = cast<ConstantInt>(Op2);
5928	if (CInt && CInt->getValue().isPowerOf2())
5929	Op2VP = TargetTransformInfo::OP_PowerOf2;
5930	Op2VK = TargetTransformInfo::OK_UniformConstantValue;
5931	} else if (isa<ConstantVector>(Op2) \|\| isa<ConstantDataVector>(Op2)) {
5932	Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
5933	Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
5934	if (SplatValue) {
5935	ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
5936	if (CInt && CInt->getValue().isPowerOf2())
5937	Op2VP = TargetTransformInfo::OP_PowerOf2;
5938	Op2VK = TargetTransformInfo::OK_UniformConstantValue;
5939	}
5940	}
5941
5942	return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
5943	Op1VP, Op2VP);
5944	}
5945	case Instruction::Select: {
5946	SelectInst *SI = cast<SelectInst>(I);
5947	const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5948	bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5949	Type *CondTy = SI->getCondition()->getType();
5950	if (!ScalarCond)
5951	CondTy = VectorType::get(CondTy, VF);
5952
5953	return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
5954	}
5955	case Instruction::ICmp:
5956	case Instruction::FCmp: {
5957	Type *ValTy = I->getOperand(0)->getType();
5958	VectorTy = ToVectorTy(ValTy, VF);
5959	return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
5960	}
5961	case Instruction::Store:
5962	case Instruction::Load: {
5963	StoreInst *SI = dyn_cast<StoreInst>(I);
5964	LoadInst *LI = dyn_cast<LoadInst>(I);
5965	Type *ValTy = (SI ? SI->getValueOperand()->getType() :
5966	LI->getType());
5967	VectorTy = ToVectorTy(ValTy, VF);
5968
5969	unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
5970	unsigned AS = SI ? SI->getPointerAddressSpace() :
5971	LI->getPointerAddressSpace();
5972	Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
5973	// We add the cost of address computation here instead of with the gep
5974	// instruction because only here we know whether the operation is
5975	// scalarized.
5976	if (VF == 1)
5977	return TTI.getAddressComputationCost(VectorTy) +
5978	TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5979
5980	// Scalarized loads/stores.
5981	int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5982	bool Reverse = ConsecutiveStride < 0;
5983	unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
5984	unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
5985	if (!ConsecutiveStride \|\| ScalarAllocatedSize != VectorElementSize) {
5986	bool IsComplexComputation =
5987	isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
5988	unsigned Cost = 0;
5989	// The cost of extracting from the value vector and pointer vector.
5990	Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5991	for (unsigned i = 0; i < VF; ++i) {
5992	// The cost of extracting the pointer operand.
5993	Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
5994	// In case of STORE, the cost of ExtractElement from the vector.
5995	// In case of LOAD, the cost of InsertElement into the returned
5996	// vector.
5997	Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
5998	Instruction::InsertElement,
5999	VectorTy, i);
6000	}
6001
6002	// The cost of the scalar loads/stores.
6003	Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
6004	Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
6005	Alignment, AS);
6006	return Cost;
6007	}
6008
6009	// Wide load/stores.
6010	unsigned Cost = TTI.getAddressComputationCost(VectorTy);
6011	Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
6012
6013	if (Reverse)
6014	Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
6015	VectorTy, 0);
6016	return Cost;
6017	}
6018	case Instruction::ZExt:
6019	case Instruction::SExt:
6020	case Instruction::FPToUI:
6021	case Instruction::FPToSI:
6022	case Instruction::FPExt:
6023	case Instruction::PtrToInt:
6024	case Instruction::IntToPtr:
6025	case Instruction::SIToFP:
6026	case Instruction::UIToFP:
6027	case Instruction::Trunc:
6028	case Instruction::FPTrunc:
6029	case Instruction::BitCast: {
6030	// We optimize the truncation of induction variable.
6031	// The cost of these is the same as the scalar operation.
6032	if (I->getOpcode() == Instruction::Trunc &&
6033	Legal->isInductionVariable(I->getOperand(0)))
6034	return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
6035	I->getOperand(0)->getType());
6036
6037	Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
6038	return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
6039	}
6040	case Instruction::Call: {
6041	CallInst *CI = cast<CallInst>(I);
6042	Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
6043	assert(ID && "Not an intrinsic call!")((ID && "Not an intrinsic call!") ? static_cast<void > (0) : __assert_fail ("ID && \"Not an intrinsic call!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6043, __PRETTY_FUNCTION__));
6044	Type *RetTy = ToVectorTy(CI->getType(), VF);
6045	SmallVector<Type*, 4> Tys;
6046	for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
6047	Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
6048	return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
6049	}
6050	default: {
6051	// We are scalarizing the instruction. Return the cost of the scalar
6052	// instruction, plus the cost of insert and extract into vector
6053	// elements, times the vector width.
6054	unsigned Cost = 0;
6055
6056	if (!RetTy->isVoidTy() && VF != 1) {
6057	unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
6058	VectorTy);
6059	unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
6060	VectorTy);
6061
6062	// The cost of inserting the results plus extracting each one of the
6063	// operands.
6064	Cost += VF * (InsCost + ExtCost * I->getNumOperands());
6065	}
6066
6067	// The cost of executing VF copies of the scalar instruction. This opcode
6068	// is unknown. Assume that it is the same as 'mul'.
6069	Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6070	return Cost;
6071	}
6072	}// end of switch.
6073	}
6074
6075	Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
6076	if (Scalar->isVoidTy() \|\| VF == 1)
6077	return Scalar;
6078	return VectorType::get(Scalar, VF);
6079	}
6080
6081	char LoopVectorize::ID = 0;
6082	static const char lv_name[] = "Loop Vectorization";
6083	INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)static void* initializeLoopVectorizePassOnce(PassRegistry & Registry) {
6084	INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)initializeTargetTransformInfoAnalysisGroup(Registry);
6085	INITIALIZE_AG_DEPENDENCY(AliasAnalysis)initializeAliasAnalysisAnalysisGroup(Registry);
6086	INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)initializeAssumptionTrackerPass(Registry);
6087	INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)initializeBlockFrequencyInfoPass(Registry);
6088	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry);
6089	INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)initializeScalarEvolutionPass(Registry);
6090	INITIALIZE_PASS_DEPENDENCY(LCSSA)initializeLCSSAPass(Registry);
6091	INITIALIZE_PASS_DEPENDENCY(LoopInfo)initializeLoopInfoPass(Registry);
6092	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)initializeLoopSimplifyPass(Registry);
6093	INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)PassInfo PI = new PassInfo(lv_name, "loop-vectorize", & LoopVectorize ::ID, PassInfo::NormalCtor_t(callDefaultCtor< LoopVectorize >), false, false); Registry.registerPass(PI, true); return PI; } void llvm::initializeLoopVectorizePass(PassRegistry & Registry) { static volatile sys::cas_flag initialized = 0; sys ::cas_flag old_val = sys::CompareAndSwap(&initialized, 1, 0); if (old_val == 0) { initializeLoopVectorizePassOnce(Registry ); sys::MemoryFence(); AnnotateIgnoreWritesBegin("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6093); AnnotateHappensBefore("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6093, &initialized); initialized = 2; AnnotateIgnoreWritesEnd ("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6093); } else { sys::cas_flag tmp = initialized; sys::MemoryFence (); while (tmp != 2) { tmp = initialized; sys::MemoryFence(); } } AnnotateHappensAfter("/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6093, &initialized); }
6094
6095	namespace llvm {
6096	Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
6097	return new LoopVectorize(NoUnrolling, AlwaysVectorize);
6098	}
6099	}
6100
6101	bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6102	// Check for a store.
6103	if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
6104	return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
6105
6106	// Check for a load.
6107	if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
6108	return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
6109
6110	return false;
6111	}
6112
6113
6114	void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
6115	bool IfPredicateStore) {
6116	assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors" ) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6116, __PRETTY_FUNCTION__));
6117	// Holds vector parameters or scalars, in case of uniform vals.
6118	SmallVector<VectorParts, 4> Params;
6119
6120	setDebugLocFromInst(Builder, Instr);
6121
6122	// Find all of the vectorized parameters.
6123	for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
6124	Value *SrcOp = Instr->getOperand(op);
6125
6126	// If we are accessing the old induction variable, use the new one.
6127	if (SrcOp == OldInduction) {
6128	Params.push_back(getVectorValue(SrcOp));
6129	continue;
6130	}
6131
6132	// Try using previously calculated values.
6133	Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
6134
6135	// If the src is an instruction that appeared earlier in the basic block
6136	// then it should already be vectorized.
6137	if (SrcInst && OrigLoop->contains(SrcInst)) {
6138	assert(WidenMap.has(SrcInst) && "Source operand is unavailable")((WidenMap.has(SrcInst) && "Source operand is unavailable" ) ? static_cast<void> (0) : __assert_fail ("WidenMap.has(SrcInst) && \"Source operand is unavailable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6138, __PRETTY_FUNCTION__));
6139	// The parameter is a vector value from earlier.
6140	Params.push_back(WidenMap.get(SrcInst));
6141	} else {
6142	// The parameter is a scalar from outside the loop. Maybe even a constant.
6143	VectorParts Scalars;
6144	Scalars.append(UF, SrcOp);
6145	Params.push_back(Scalars);
6146	}
6147	}
6148
6149	assert(Params.size() == Instr->getNumOperands() &&((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6150, __PRETTY_FUNCTION__))
6150	"Invalid number of operands")((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6150, __PRETTY_FUNCTION__));
6151
6152	// Does this instruction return a value ?
6153	bool IsVoidRetTy = Instr->getType()->isVoidTy();
6154
6155	Value *UndefVec = IsVoidRetTy ? nullptr :
6156	UndefValue::get(Instr->getType());
6157	// Create a new entry in the WidenMap and initialize it to Undef or Null.
6158	VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
6159
6160	Instruction *InsertPt = Builder.GetInsertPoint();
6161	BasicBlock *IfBlock = Builder.GetInsertBlock();
6162	BasicBlock *CondBlock = nullptr;
6163
6164	VectorParts Cond;
6165	Loop *VectorLp = nullptr;
6166	if (IfPredicateStore) {
6167	assert(Instr->getParent()->getSinglePredecessor() &&((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6168, __PRETTY_FUNCTION__))
6168	"Only support single predecessor blocks")((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6168, __PRETTY_FUNCTION__));
6169	Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
6170	Instr->getParent());
6171	VectorLp = LI->getLoopFor(IfBlock);
6172	assert(VectorLp && "Must have a loop for this block")((VectorLp && "Must have a loop for this block") ? static_cast <void> (0) : __assert_fail ("VectorLp && \"Must have a loop for this block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6172, __PRETTY_FUNCTION__));
6173	}
6174
6175	// For each vector unroll 'part':
6176	for (unsigned Part = 0; Part < UF; ++Part) {
6177	// For each scalar that we create:
6178
6179	// Start an "if (pred) a[i] = ..." block.
6180	Value *Cmp = nullptr;
6181	if (IfPredicateStore) {
6182	if (Cond[Part]->getType()->isVectorTy())
6183	Cond[Part] =
6184	Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
6185	Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
6186	ConstantInt::get(Cond[Part]->getType(), 1));
6187	CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
6188	LoopVectorBody.push_back(CondBlock);
6189	VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
6190	// Update Builder with newly created basic block.
6191	Builder.SetInsertPoint(InsertPt);
6192	}
6193
6194	Instruction *Cloned = Instr->clone();
6195	if (!IsVoidRetTy)
6196	Cloned->setName(Instr->getName() + ".cloned");
6197	// Replace the operands of the cloned instructions with extracted scalars.
6198	for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
6199	Value *Op = Params[op][Part];
6200	Cloned->setOperand(op, Op);
6201	}
6202
6203	// Place the cloned scalar in the new loop.
6204	Builder.Insert(Cloned);
6205
6206	// If the original scalar returns a value we need to place it in a vector
6207	// so that future users will be able to use it.
6208	if (!IsVoidRetTy)
6209	VecResults[Part] = Cloned;
6210
6211	// End if-block.
6212	if (IfPredicateStore) {
6213	BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
6214	LoopVectorBody.push_back(NewIfBlock);
6215	VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
6216	Builder.SetInsertPoint(InsertPt);
6217	Instruction *OldBr = IfBlock->getTerminator();
6218	BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
6219	OldBr->eraseFromParent();
6220	IfBlock = NewIfBlock;
6221	}
6222	}
6223	}
6224
6225	void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
6226	StoreInst *SI = dyn_cast<StoreInst>(Instr);
6227	bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
6228
6229	return scalarizeInstruction(Instr, IfPredicateStore);
6230	}
6231
6232	Value InnerLoopUnroller::reverseVector(Value Vec) {
6233	return Vec;
6234	}
6235
6236	Value InnerLoopUnroller::getBroadcastInstrs(Value V) {
6237	return V;
6238	}
6239
6240	Value InnerLoopUnroller::getConsecutiveVector(Value Val, int StartIdx,
6241	bool Negate) {
6242	// When unrolling and the VF is 1, we only need to add a simple scalar.
6243	Type *ITy = Val->getType();
6244	assert(!ITy->isVectorTy() && "Val must be a scalar")((!ITy->isVectorTy() && "Val must be a scalar") ? static_cast <void> (0) : __assert_fail ("!ITy->isVectorTy() && \"Val must be a scalar\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6244, __PRETTY_FUNCTION__));
6245	Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
6246	return Builder.CreateAdd(Val, C, "induction");
6247	}