File: lib/Transforms/Vectorize/LoopVectorize.cpp
Location: line 1219, column 5
Description: Value stored to 'LoopID' is never read
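
The flagged statement is the trailing assignment in LoopVectorizeHints::writeHintsToMetadata (source line 1219 below). 'LoopID' is a function-local variable and the function returns immediately after the store, so the assigned value can never be read. A minimal sketch of the fix, assuming no later use of the variable is intended:

    // End of writeHintsToMetadata():
    TheLoop->setLoopID(NewLoopID);
    // Drop the trailing 'LoopID = NewLoopID;' -- LoopID is a local that
    // goes out of scope without being read again, so the store is dead.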
1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
11 | // and generates target-independent LLVM-IR. |
12 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
13 | // of instructions in order to estimate the profitability of vectorization. |
14 | // |
15 | // The loop vectorizer combines consecutive loop iterations into a single |
16 | // 'wide' iteration. After this transformation the index is incremented |
17 | // by the SIMD vector width, and not by one. |
18 | // |
19 | // This pass has four parts: |
20 | // 1. The main loop pass that drives the different parts. |
21 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
22 | // of the vectorization. |
23 | // 3. InnerLoopVectorizer - A unit that performs the actual |
24 | // widening of instructions. |
25 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
26 | // of vectorization. It decides on the optimal vector width, which |
27 | // can be one, if vectorization is not profitable. |
28 | // |
29 | //===----------------------------------------------------------------------===// |
30 | // |
31 | // The reduction-variable vectorization is based on the paper: |
32 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
33 | // |
34 | // Variable uniformity checks are inspired by: |
35 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
36 | // |
37 | // Other ideas/concepts are from: |
38 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
39 | // |
40 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
41 | // Vectorizing Compilers. |
42 | // |
43 | //===----------------------------------------------------------------------===// |
44 | |
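For reference, the widening described in the header above can be written out as a self-contained sketch. VF = 4 is an assumed width, and the inner loop stands in for a single wide vector operation that real codegen would emit:

    #include <cstddef>

    // Scalar loop: the index advances by one per iteration.
    void add_scalar(float *a, const float *b, const float *c, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i)
        a[i] = b[i] + c[i];
    }

    // Widened loop: the index advances by the SIMD width; the trailing
    // scalar loop is the epilogue for trip counts not a multiple of VF.
    void add_widened(float *a, const float *b, const float *c, std::size_t n) {
      std::size_t i = 0;
      for (; i + 4 <= n; i += 4)
        for (std::size_t j = 0; j < 4; ++j) // one wide add in practice
          a[i + j] = b[i + j] + c[i + j];
      for (; i < n; ++i) // scalar epilogue
        a[i] = b[i] + c[i];
    }
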
45 | #include "llvm/Transforms/Vectorize.h" |
46 | #include "llvm/ADT/DenseMap.h" |
47 | #include "llvm/ADT/EquivalenceClasses.h" |
48 | #include "llvm/ADT/Hashing.h" |
49 | #include "llvm/ADT/MapVector.h" |
50 | #include "llvm/ADT/SetVector.h" |
51 | #include "llvm/ADT/SmallPtrSet.h" |
52 | #include "llvm/ADT/SmallSet.h" |
53 | #include "llvm/ADT/SmallVector.h" |
54 | #include "llvm/ADT/Statistic.h" |
55 | #include "llvm/ADT/StringExtras.h" |
56 | #include "llvm/Analysis/AliasAnalysis.h" |
57 | #include "llvm/Analysis/AliasSetTracker.h" |
58 | #include "llvm/Analysis/AssumptionTracker.h" |
59 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
60 | #include "llvm/Analysis/CodeMetrics.h" |
61 | #include "llvm/Analysis/LoopInfo.h" |
62 | #include "llvm/Analysis/LoopIterator.h" |
63 | #include "llvm/Analysis/LoopPass.h" |
64 | #include "llvm/Analysis/ScalarEvolution.h" |
65 | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
66 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
67 | #include "llvm/Analysis/TargetTransformInfo.h" |
68 | #include "llvm/Analysis/ValueTracking.h" |
69 | #include "llvm/IR/Constants.h" |
70 | #include "llvm/IR/DataLayout.h" |
71 | #include "llvm/IR/DebugInfo.h" |
72 | #include "llvm/IR/DerivedTypes.h" |
73 | #include "llvm/IR/DiagnosticInfo.h" |
74 | #include "llvm/IR/Dominators.h" |
75 | #include "llvm/IR/Function.h" |
76 | #include "llvm/IR/IRBuilder.h" |
77 | #include "llvm/IR/Instructions.h" |
78 | #include "llvm/IR/IntrinsicInst.h" |
79 | #include "llvm/IR/LLVMContext.h" |
80 | #include "llvm/IR/Module.h" |
81 | #include "llvm/IR/PatternMatch.h" |
82 | #include "llvm/IR/Type.h" |
83 | #include "llvm/IR/Value.h" |
84 | #include "llvm/IR/ValueHandle.h" |
85 | #include "llvm/IR/Verifier.h" |
86 | #include "llvm/Pass.h" |
87 | #include "llvm/Support/BranchProbability.h" |
88 | #include "llvm/Support/CommandLine.h" |
89 | #include "llvm/Support/Debug.h" |
90 | #include "llvm/Support/raw_ostream.h" |
91 | #include "llvm/Transforms/Scalar.h" |
92 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
93 | #include "llvm/Transforms/Utils/Local.h" |
94 | #include "llvm/Transforms/Utils/VectorUtils.h" |
95 | #include <algorithm> |
96 | #include <map> |
97 | #include <tuple> |
98 | |
99 | using namespace llvm; |
100 | using namespace llvm::PatternMatch; |
101 | |
102 | #define LV_NAME "loop-vectorize" |
103 | #define DEBUG_TYPE LV_NAME |
104 | |
105 | STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
106 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
107 | |
108 | static cl::opt<unsigned> |
109 | VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, |
110 | cl::desc("Sets the SIMD width. Zero is autoselect.")); |
111 | |
112 | static cl::opt<unsigned> |
113 | VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, |
114 | cl::desc("Sets the vectorization interleave count. " |
115 | "Zero is autoselect.")); |
116 | |
117 | static cl::opt<bool> |
118 | EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, |
119 | cl::desc("Enable if-conversion during vectorization.")); |
120 | |
121 | /// We don't vectorize loops with a known constant trip count below this number. |
122 | static cl::opt<unsigned> |
123 | TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), |
124 | cl::Hidden, |
125 | cl::desc("Don't vectorize loops with a constant " |
126 | "trip count that is smaller than this " |
127 | "value.")); |
128 | |
129 | /// This enables versioning on the strides of symbolically striding memory |
130 | /// accesses in code like the following. |
131 | /// for (i = 0; i < N; ++i) |
132 | /// A[i * Stride1] += B[i * Stride2] ... |
133 | /// |
134 | /// Will be roughly translated to |
135 | /// if (Stride1 == 1 && Stride2 == 1) { |
136 | /// for (i = 0; i < N; i+=4) |
137 | /// A[i:i+3] += ... |
138 | /// } else |
139 | /// ... |
140 | static cl::opt<bool> EnableMemAccessVersioning( |
141 | "enable-mem-access-versioning", cl::init(true), cl::Hidden, |
142 | cl::desc("Enable symblic stride memory access versioning")); |
143 | |
144 | /// We don't unroll loops with a known constant trip count below this number. |
145 | static const unsigned TinyTripCountUnrollThreshold = 128; |
146 | |
147 | /// When performing memory disambiguation checks at runtime do not make more |
148 | /// than this number of comparisons. |
149 | static const unsigned RuntimeMemoryCheckThreshold = 8; |
150 | |
151 | /// Maximum SIMD width. |
152 | static const unsigned MaxVectorWidth = 64; |
153 | |
154 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
155 | "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
156 | cl::desc("A flag that overrides the target's number of scalar registers.")); |
157 | |
158 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
159 | "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
160 | cl::desc("A flag that overrides the target's number of vector registers.")); |
161 | |
162 | /// Maximum vectorization interleave count. |
163 | static const unsigned MaxInterleaveFactor = 16; |
164 | |
165 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
166 | "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
167 | cl::desc("A flag that overrides the target's max interleave factor for " |
168 | "scalar loops.")); |
169 | |
170 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
171 | "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
172 | cl::desc("A flag that overrides the target's max interleave factor for " |
173 | "vectorized loops.")); |
174 | |
175 | static cl::opt<unsigned> ForceTargetInstructionCost( |
176 | "force-target-instruction-cost", cl::init(0), cl::Hidden, |
177 | cl::desc("A flag that overrides the target's expected cost for " |
178 | "an instruction to a single constant value. Mostly " |
179 | "useful for getting consistent testing.")); |
180 | |
181 | static cl::opt<unsigned> SmallLoopCost( |
182 | "small-loop-cost", cl::init(20), cl::Hidden, |
183 | cl::desc("The cost of a loop that is considered 'small' by the unroller.")); |
184 | |
185 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
186 | "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, |
187 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
188 | "heuristics minimizing code growth in cold regions and being more " |
189 | "aggressive in hot regions.")); |
190 | |
191 | // Runtime unroll loops for load/store throughput. |
192 | static cl::opt<bool> EnableLoadStoreRuntimeUnroll( |
193 | "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, |
194 | cl::desc("Enable runtime unrolling until load/store ports are saturated")); |
195 | |
196 | /// The number of stores in a loop that are allowed to need predication. |
197 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
198 | "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
199 | cl::desc("Max number of stores to be predicated behind an if.")); |
200 | |
201 | static cl::opt<bool> EnableIndVarRegisterHeur( |
202 | "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
203 | cl::desc("Count the induction variable only once when unrolling")); |
204 | |
205 | static cl::opt<bool> EnableCondStoresVectorization( |
206 | "enable-cond-stores-vec", cl::init(false), cl::Hidden, |
207 | cl::desc("Enable if predication of stores during vectorization.")); |
208 | |
209 | static cl::opt<unsigned> MaxNestedScalarReductionUF( |
210 | "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, |
211 | cl::desc("The maximum unroll factor to use when unrolling a scalar " |
212 | "reduction in a nested loop.")); |
213 | |
214 | namespace { |
215 | |
216 | // Forward declarations. |
217 | class LoopVectorizationLegality; |
218 | class LoopVectorizationCostModel; |
219 | class LoopVectorizeHints; |
220 | |
221 | /// Optimization analysis message produced during vectorization. Messages inform |
222 | /// the user why vectorization did not occur. |
223 | class Report { |
224 | std::string Message; |
225 | raw_string_ostream Out; |
226 | Instruction *Instr; |
227 | |
228 | public: |
229 | Report(Instruction *I = nullptr) : Out(Message), Instr(I) { |
230 | Out << "loop not vectorized: "; |
231 | } |
232 | |
233 | template <typename A> Report &operator<<(const A &Value) { |
234 | Out << Value; |
235 | return *this; |
236 | } |
237 | |
238 | Instruction *getInstr() { return Instr; } |
239 | |
240 | std::string &str() { return Out.str(); } |
241 | operator Twine() { return Out.str(); } |
242 | }; |
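
A hypothetical usage sketch of Report (the instruction and message are invented for illustration); the emitAnalysis helpers further down forward the accumulated string as an optimization remark:

    Report R(UnsupportedCall);   // message starts as "loop not vectorized: "
    R << "call instruction cannot be vectorized";
    std::string Msg = R.str();   // "loop not vectorized: call instruction ..."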
243 | |
244 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
245 | /// block to a specified vectorization factor (VF). |
246 | /// This class performs the widening of scalars into vectors, or multiple |
247 | /// scalars. This class also implements the following features: |
248 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
249 | /// counts that are known to be a multiple of the vectorization factor. |
250 | /// * It handles the code generation for reduction variables. |
251 | /// * Scalarization (implementation using scalars) of un-vectorizable |
252 | /// instructions. |
253 | /// InnerLoopVectorizer does not perform any vectorization-legality |
254 | /// checks, and relies on the caller to check for the different legality |
255 | /// aspects. The InnerLoopVectorizer relies on the |
256 | /// LoopVectorizationLegality class to provide information about the induction |
257 | /// and reduction variables that were found to a given vectorization factor. |
258 | class InnerLoopVectorizer { |
259 | public: |
260 | InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
261 | DominatorTree *DT, const DataLayout *DL, |
262 | const TargetLibraryInfo *TLI, unsigned VecWidth, |
263 | unsigned UnrollFactor) |
264 | : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), |
265 | VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), |
266 | Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), |
267 | Legal(nullptr) {} |
268 | |
269 | // Perform the actual loop widening (vectorization). |
270 | void vectorize(LoopVectorizationLegality *L) { |
271 | Legal = L; |
272 | // Create a new empty loop. Unlink the old loop and connect the new one. |
273 | createEmptyLoop(); |
274 | // Widen each instruction in the old loop to a new one in the new loop. |
275 | // Use the Legality module to find the induction and reduction variables. |
276 | vectorizeLoop(); |
277 | // Register the new loop and update the analysis passes. |
278 | updateAnalysis(); |
279 | } |
280 | |
281 | virtual ~InnerLoopVectorizer() {} |
282 | |
283 | protected: |
284 | /// A small list of PHINodes. |
285 | typedef SmallVector<PHINode*, 4> PhiVector; |
286 | /// When we unroll loops we have multiple vector values for each scalar. |
287 | /// This data structure holds the unrolled and vectorized values that |
288 | /// originated from one scalar instruction. |
289 | typedef SmallVector<Value*, 2> VectorParts; |
290 | |
291 | // When we if-convert we need to create edge masks. We have to cache values so |
292 | // that we don't end up with exponential recursion/IR. |
293 | typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, |
294 | VectorParts> EdgeMaskCache; |
295 | |
296 | /// \brief Add code that checks at runtime if the accessed arrays overlap. |
297 | /// |
298 | /// Returns a pair of instructions where the first element is the first |
299 | /// instruction generated in possibly a sequence of instructions and the |
300 | /// second value is the final comparator value or NULL if no check is needed. |
301 | std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); |
302 | |
303 | /// \brief Add checks for strides that were assumed to be 1. |
304 | /// |
305 | /// Returns the first and the last generated check instructions as the |
306 | /// pair (first, last). |
307 | std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); |
308 | |
309 | /// Create an empty loop, based on the loop ranges of the old loop. |
310 | void createEmptyLoop(); |
311 | /// Copy and widen the instructions from the old loop. |
312 | virtual void vectorizeLoop(); |
313 | |
314 | /// \brief The Loop exit block may have single value PHI nodes where the |
315 | /// incoming value is 'Undef'. While vectorizing we only handled real values |
316 | /// that were defined inside the loop. Here we fix the 'undef case'. |
317 | /// See PR14725. |
318 | void fixLCSSAPHIs(); |
319 | |
320 | /// A helper function that computes the predicate of the block BB, assuming |
321 | /// that the header block of the loop is set to True. It returns the *entry* |
322 | /// mask for the block BB. |
323 | VectorParts createBlockInMask(BasicBlock *BB); |
324 | /// A helper function that computes the predicate of the edge between SRC |
325 | /// and DST. |
326 | VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); |
327 | |
328 | /// A helper function to vectorize a single BB within the innermost loop. |
329 | void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); |
330 | |
331 | /// Vectorize a single PHINode in a block. This method handles the induction |
332 | /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
333 | /// arbitrary length vectors. |
334 | void widenPHIInstruction(Instruction *PN, VectorParts &Entry, |
335 | unsigned UF, unsigned VF, PhiVector *PV); |
336 | |
337 | /// Insert the new loop into the loop hierarchy and pass manager |
338 | /// and update the analysis passes. |
339 | void updateAnalysis(); |
340 | |
341 | /// This instruction is un-vectorizable. Implement it as a sequence |
342 | /// of scalars. If \p IfPredicateStore is true we need to 'hide' each |
343 | /// scalarized instruction behind an if block predicated on the control |
344 | /// dependence of the instruction. |
345 | virtual void scalarizeInstruction(Instruction *Instr, |
346 | bool IfPredicateStore=false); |
347 | |
348 | /// Vectorize Load and Store instructions. |
349 | virtual void vectorizeMemoryInstruction(Instruction *Instr); |
350 | |
351 | /// Create a broadcast instruction. This method generates a broadcast |
352 | /// instruction (shuffle) for loop invariant values and for the induction |
353 | /// value. If this is the induction variable then we extend it to N, N+1, ... |
354 | /// this is needed because each iteration in the loop corresponds to a SIMD |
355 | /// element. |
356 | virtual Value *getBroadcastInstrs(Value *V); |
357 | |
358 | /// This function adds 0, 1, 2, ... to each vector element, starting at |
359 | /// StartIdx. If Negate is set then negative numbers are added, |
360 | /// e.g. (0, -1, -2, ...). |
361 | virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); |
362 | |
363 | /// When we go over instructions in the basic block we rely on previous |
364 | /// values within the current basic block or on loop invariant values. |
365 | /// When we widen (vectorize) values we place them in the map. If the values |
366 | /// are not within the map, they have to be loop invariant, so we simply |
367 | /// broadcast them into a vector. |
368 | VectorParts &getVectorValue(Value *V); |
369 | |
370 | /// Generate a shuffle sequence that will reverse the vector Vec. |
371 | virtual Value *reverseVector(Value *Vec); |
372 | |
373 | /// This is a helper class that holds the vectorizer state. It maps scalar |
374 | /// instructions to vector instructions. When the code is 'unrolled' then |
375 | /// a single scalar value is mapped to multiple vector parts. The parts |
376 | /// are stored in the VectorPart type. |
377 | struct ValueMap { |
378 | /// C'tor. UnrollFactor controls the number of vectors ('parts') that |
379 | /// are mapped. |
380 | ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} |
381 | |
382 | /// \return True if 'Key' is saved in the Value Map. |
383 | bool has(Value *Key) const { return MapStorage.count(Key); } |
384 | |
385 | /// Initializes a new entry in the map. Sets all of the vector parts to the |
386 | /// same value, 'Val'. |
387 | /// \return A reference to a vector with splat values. |
388 | VectorParts &splat(Value *Key, Value *Val) { |
389 | VectorParts &Entry = MapStorage[Key]; |
390 | Entry.assign(UF, Val); |
391 | return Entry; |
392 | } |
393 | |
394 | ///\return A reference to the value that is stored at 'Key'. |
395 | VectorParts &get(Value *Key) { |
396 | VectorParts &Entry = MapStorage[Key]; |
397 | if (Entry.empty()) |
398 | Entry.resize(UF); |
399 | assert(Entry.size() == UF); |
400 | return Entry; |
401 | } |
402 | |
403 | private: |
404 | /// The unroll factor. Each entry in the map stores this number of vector |
405 | /// elements. |
406 | unsigned UF; |
407 | |
408 | /// Map storage. We use std::map and not DenseMap because insertions to a |
409 | /// dense map invalidate its iterators. |
410 | std::map<Value *, VectorParts> MapStorage; |
411 | }; |
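
A short usage sketch of ValueMap under an assumed unroll factor of 2; the value names are illustrative:

    ValueMap WM(2);
    // A loop-invariant scalar maps to the same wide value in both parts.
    VectorParts &Parts = WM.splat(ScalarV, WideV); // Parts[0] == Parts[1] == WideV
    VectorParts &Entry = WM.get(ScalarV);          // cached entry, already UF parts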
412 | |
413 | /// The original loop. |
414 | Loop *OrigLoop; |
415 | /// Scev analysis to use. |
416 | ScalarEvolution *SE; |
417 | /// Loop Info. |
418 | LoopInfo *LI; |
419 | /// Dominator Tree. |
420 | DominatorTree *DT; |
421 | /// Alias Analysis. |
422 | AliasAnalysis *AA; |
423 | /// Data Layout. |
424 | const DataLayout *DL; |
425 | /// Target Library Info. |
426 | const TargetLibraryInfo *TLI; |
427 | |
428 | /// The vectorization SIMD factor to use. Each vector will have this many |
429 | /// vector elements. |
430 | unsigned VF; |
431 | |
432 | protected: |
433 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
434 | /// many different vector instructions. |
435 | unsigned UF; |
436 | |
437 | /// The builder that we use |
438 | IRBuilder<> Builder; |
439 | |
440 | // --- Vectorization state --- |
441 | |
442 | /// The vector-loop preheader. |
443 | BasicBlock *LoopVectorPreHeader; |
444 | /// The scalar-loop preheader. |
445 | BasicBlock *LoopScalarPreHeader; |
446 | /// Middle Block between the vector and the scalar. |
447 | BasicBlock *LoopMiddleBlock; |
448 | ///The ExitBlock of the scalar loop. |
449 | BasicBlock *LoopExitBlock; |
450 | ///The vector loop body. |
451 | SmallVector<BasicBlock *, 4> LoopVectorBody; |
452 | ///The scalar loop body. |
453 | BasicBlock *LoopScalarBody; |
454 | /// A list of all bypass blocks. The first block is the entry of the loop. |
455 | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
456 | |
457 | /// The new Induction variable which was added to the new block. |
458 | PHINode *Induction; |
459 | /// The induction variable of the old basic block. |
460 | PHINode *OldInduction; |
461 | /// Holds the extended (to the widest induction type) start index. |
462 | Value *ExtendedIdx; |
463 | /// Maps scalars to widened vectors. |
464 | ValueMap WidenMap; |
465 | EdgeMaskCache MaskCache; |
466 | |
467 | LoopVectorizationLegality *Legal; |
468 | }; |
469 | |
470 | class InnerLoopUnroller : public InnerLoopVectorizer { |
471 | public: |
472 | InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
473 | DominatorTree *DT, const DataLayout *DL, |
474 | const TargetLibraryInfo *TLI, unsigned UnrollFactor) : |
475 | InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } |
476 | |
477 | private: |
478 | void scalarizeInstruction(Instruction *Instr, |
479 | bool IfPredicateStore = false) override; |
480 | void vectorizeMemoryInstruction(Instruction *Instr) override; |
481 | Value *getBroadcastInstrs(Value *V) override; |
482 | Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; |
483 | Value *reverseVector(Value *Vec) override; |
484 | }; |
485 | |
486 | /// \brief Look for a meaningful debug location on the instruction or its |
487 | /// operands. |
488 | static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
489 | if (!I) |
490 | return I; |
491 | |
492 | DebugLoc Empty; |
493 | if (I->getDebugLoc() != Empty) |
494 | return I; |
495 | |
496 | for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { |
497 | if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
498 | if (OpInst->getDebugLoc() != Empty) |
499 | return OpInst; |
500 | } |
501 | |
502 | return I; |
503 | } |
504 | |
505 | /// \brief Set the debug location in the builder using the debug location in the |
506 | /// instruction. |
507 | static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
508 | if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) |
509 | B.SetCurrentDebugLocation(Inst->getDebugLoc()); |
510 | else |
511 | B.SetCurrentDebugLocation(DebugLoc()); |
512 | } |
513 | |
514 | #ifndef NDEBUG |
515 | /// \return string containing a file name and a line # for the given loop. |
516 | static std::string getDebugLocString(const Loop *L) { |
517 | std::string Result; |
518 | if (L) { |
519 | raw_string_ostream OS(Result); |
520 | const DebugLoc LoopDbgLoc = L->getStartLoc(); |
521 | if (!LoopDbgLoc.isUnknown()) |
522 | LoopDbgLoc.print(L->getHeader()->getContext(), OS); |
523 | else |
524 | // Just print the module name. |
525 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
526 | OS.flush(); |
527 | } |
528 | return Result; |
529 | } |
530 | #endif |
531 | |
532 | /// \brief Propagate known metadata from one instruction to another. |
533 | static void propagateMetadata(Instruction *To, const Instruction *From) { |
534 | SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; |
535 | From->getAllMetadataOtherThanDebugLoc(Metadata); |
536 | |
537 | for (auto M : Metadata) { |
538 | unsigned Kind = M.first; |
539 | |
540 | // These are safe to transfer (this is safe for TBAA, even when we |
541 | // if-convert, because should that metadata have had a control dependency |
542 | // on the condition, and thus actually aliased with some other |
543 | // non-speculated memory access when the condition was false, this would be |
544 | // caught by the runtime overlap checks). |
545 | if (Kind != LLVMContext::MD_tbaa && |
546 | Kind != LLVMContext::MD_alias_scope && |
547 | Kind != LLVMContext::MD_noalias && |
548 | Kind != LLVMContext::MD_fpmath) |
549 | continue; |
550 | |
551 | To->setMetadata(Kind, M.second); |
552 | } |
553 | } |
554 | |
555 | /// \brief Propagate known metadata from one instruction to a vector of others. |
556 | static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { |
557 | for (Value *V : To) |
558 | if (Instruction *I = dyn_cast<Instruction>(V)) |
559 | propagateMetadata(I, From); |
560 | } |
561 | |
562 | /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and |
563 | /// to what vectorization factor. |
564 | /// This class does not look at the profitability of vectorization, only the |
565 | /// legality. This class has two main kinds of checks: |
566 | /// * Memory checks - The code in canVectorizeMemory checks if vectorization |
567 | /// will change the order of memory accesses in a way that will change the |
568 | /// correctness of the program. |
569 | /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory |
570 | /// checks for a number of different conditions, such as the availability of a |
571 | /// single induction variable, that all types are supported and vectorizable, |
572 | /// etc. This code reflects the capabilities of InnerLoopVectorizer. |
573 | /// This class is also used by InnerLoopVectorizer for identifying |
574 | /// the induction variable and the different reduction variables. |
575 | class LoopVectorizationLegality { |
576 | public: |
577 | unsigned NumLoads; |
578 | unsigned NumStores; |
579 | unsigned NumPredStores; |
580 | |
581 | LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, |
582 | DominatorTree *DT, TargetLibraryInfo *TLI, |
583 | AliasAnalysis *AA, Function *F, |
584 | const TargetTransformInfo *TTI) |
585 | : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), |
586 | DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr), |
587 | WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { |
588 | } |
589 | |
590 | /// This enum represents the kinds of reductions that we support. |
591 | enum ReductionKind { |
592 | RK_NoReduction, ///< Not a reduction. |
593 | RK_IntegerAdd, ///< Sum of integers. |
594 | RK_IntegerMult, ///< Product of integers. |
595 | RK_IntegerOr, ///< Bitwise or logical OR of numbers. |
596 | RK_IntegerAnd, ///< Bitwise or logical AND of numbers. |
597 | RK_IntegerXor, ///< Bitwise or logical XOR of numbers. |
598 | RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). |
599 | RK_FloatAdd, ///< Sum of floats. |
600 | RK_FloatMult, ///< Product of floats. |
601 | RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). |
602 | }; |
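
As a point of reference, the source-level shape of an RK_IntegerAdd reduction looks like the following sketch (array and start value are illustrative):

    int sum_array(const int *A, int N, int Start) {
      int Sum = Start;           // the start value need not be zero
      for (int i = 0; i < N; ++i)
        Sum += A[i];             // the reduction update
      return Sum;                // the final value is used outside the loop
    }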
603 | |
604 | /// This enum represents the kinds of inductions that we support. |
605 | enum InductionKind { |
606 | IK_NoInduction, ///< Not an induction variable. |
607 | IK_IntInduction, ///< Integer induction variable. Step = 1. |
608 | IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. |
609 | IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). |
610 | IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). |
611 | }; |
612 | |
613 | // This enum represents the kind of minmax reduction. |
614 | enum MinMaxReductionKind { |
615 | MRK_Invalid, |
616 | MRK_UIntMin, |
617 | MRK_UIntMax, |
618 | MRK_SIntMin, |
619 | MRK_SIntMax, |
620 | MRK_FloatMin, |
621 | MRK_FloatMax |
622 | }; |
623 | |
624 | /// This struct holds information about reduction variables. |
625 | struct ReductionDescriptor { |
626 | ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), |
627 | Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} |
628 | |
629 | ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, |
630 | MinMaxReductionKind MK) |
631 | : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} |
632 | |
633 | // The starting value of the reduction. |
634 | // It does not have to be zero! |
635 | TrackingVH<Value> StartValue; |
636 | // The instruction whose value is used outside the loop. |
637 | Instruction *LoopExitInstr; |
638 | // The kind of the reduction. |
639 | ReductionKind Kind; |
640 | // If this is a min/max reduction, the kind of reduction. |
641 | MinMaxReductionKind MinMaxKind; |
642 | }; |
643 | |
644 | /// This POD struct holds information about a potential reduction operation. |
645 | struct ReductionInstDesc { |
646 | ReductionInstDesc(bool IsRedux, Instruction *I) : |
647 | IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} |
648 | |
649 | ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : |
650 | IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} |
651 | |
652 | // Is this instruction a reduction candidate. |
653 | bool IsReduction; |
654 | // The last instruction in a min/max pattern (select of the select(icmp()) |
655 | // pattern), or the current reduction instruction otherwise. |
656 | Instruction *PatternLastInst; |
657 | // If this is a min/max pattern the comparison predicate. |
658 | MinMaxReductionKind MinMaxKind; |
659 | }; |
660 | |
661 | /// This struct holds information about the runtime memory check that |
662 | /// verifies a group of pointers do not overlap. |
663 | struct RuntimePointerCheck { |
664 | RuntimePointerCheck() : Need(false) {} |
665 | |
666 | /// Reset the state of the pointer runtime information. |
667 | void reset() { |
668 | Need = false; |
669 | Pointers.clear(); |
670 | Starts.clear(); |
671 | Ends.clear(); |
672 | IsWritePtr.clear(); |
673 | DependencySetId.clear(); |
674 | AliasSetId.clear(); |
675 | } |
676 | |
677 | /// Insert a pointer and calculate the start and end SCEVs. |
678 | void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, |
679 | unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); |
680 | |
681 | /// This flag indicates if we need to add the runtime check. |
682 | bool Need; |
683 | /// Holds the pointers that we need to check. |
684 | SmallVector<TrackingVH<Value>, 2> Pointers; |
685 | /// Holds the pointer value at the beginning of the loop. |
686 | SmallVector<const SCEV*, 2> Starts; |
687 | /// Holds the pointer value at the end of the loop. |
688 | SmallVector<const SCEV*, 2> Ends; |
689 | /// Holds whether this pointer is used for writing to memory. |
690 | SmallVector<bool, 2> IsWritePtr; |
691 | /// Holds the id of the set of pointers that could be dependent because of a |
692 | /// shared underlying object. |
693 | SmallVector<unsigned, 2> DependencySetId; |
694 | /// Holds the id of the disjoint alias set to which this pointer belongs. |
695 | SmallVector<unsigned, 2> AliasSetId; |
696 | }; |
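
In source form, the runtime check that this struct describes amounts to a disjointness test per pointer pair, with the start and end addresses playing the role of the Starts/Ends SCEVs above (a sketch, with addresses already computed):

    #include <cstdint>

    // Two ranges are disjoint if one ends before the other begins; the
    // vector loop is taken only when this holds for every checked pair.
    bool disjoint(std::uintptr_t AStart, std::uintptr_t AEnd,
                  std::uintptr_t BStart, std::uintptr_t BEnd) {
      return AEnd <= BStart || BEnd <= AStart;
    }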
697 | |
698 | /// A struct for saving information about induction variables. |
699 | struct InductionInfo { |
700 | InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} |
701 | InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} |
702 | /// Start value. |
703 | TrackingVH<Value> StartValue; |
704 | /// Induction kind. |
705 | InductionKind IK; |
706 | }; |
707 | |
708 | /// ReductionList contains the reduction descriptors for all |
709 | /// of the reductions that were found in the loop. |
710 | typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; |
711 | |
712 | /// InductionList saves induction variables and maps them to the |
713 | /// induction descriptor. |
714 | typedef MapVector<PHINode*, InductionInfo> InductionList; |
715 | |
716 | /// Returns true if it is legal to vectorize this loop. |
717 | /// This does not mean that it is profitable to vectorize this |
718 | /// loop, only that it is legal to do so. |
719 | bool canVectorize(); |
720 | |
721 | /// Returns the Induction variable. |
722 | PHINode *getInduction() { return Induction; } |
723 | |
724 | /// Returns the reduction variables found in the loop. |
725 | ReductionList *getReductionVars() { return &Reductions; } |
726 | |
727 | /// Returns the induction variables found in the loop. |
728 | InductionList *getInductionVars() { return &Inductions; } |
729 | |
730 | /// Returns the widest induction type. |
731 | Type *getWidestInductionType() { return WidestIndTy; } |
732 | |
733 | /// Returns True if V is an induction variable in this loop. |
734 | bool isInductionVariable(const Value *V); |
735 | |
736 | /// Return true if the block BB needs to be predicated in order for the loop |
737 | /// to be vectorized. |
738 | bool blockNeedsPredication(BasicBlock *BB); |
739 | |
740 | /// Check if this pointer is consecutive when vectorizing. This happens |
741 | /// when the last index of the GEP is the induction variable, or when the |
742 | /// pointer itself is an induction variable. |
743 | /// This check allows us to vectorize A[idx] into a wide load/store. |
744 | /// Returns: |
745 | /// 0 - Stride is unknown or non-consecutive. |
746 | /// 1 - Address is consecutive. |
747 | /// -1 - Address is consecutive, and decreasing. |
748 | int isConsecutivePtr(Value *Ptr); |
749 | |
750 | /// Returns true if the value V is uniform within the loop. |
751 | bool isUniform(Value *V); |
752 | |
753 | /// Returns true if this instruction will remain scalar after vectorization. |
754 | bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } |
755 | |
756 | /// Returns the information that we collected about runtime memory check. |
757 | RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } |
758 | |
759 | /// This function returns the identity element (or neutral element) for |
760 | /// the operation K. |
761 | static Constant *getReductionIdentity(ReductionKind K, Type *Tp); |
762 | |
763 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
764 | |
765 | bool hasStride(Value *V) { return StrideSet.count(V); } |
766 | bool mustCheckStrides() { return !StrideSet.empty(); } |
767 | SmallPtrSet<Value *, 8>::iterator strides_begin() { |
768 | return StrideSet.begin(); |
769 | } |
770 | SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } |
771 | |
772 | /// Returns true if the target machine supports masked store operation |
773 | /// for the given \p DataType and kind of access to \p Ptr. |
774 | bool isLegalMaskedStore(Type *DataType, Value *Ptr) { |
775 | return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); |
776 | } |
777 | /// Returns true if the target machine supports masked load operation |
778 | /// for the given \p DataType and kind of access to \p Ptr. |
779 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { |
780 | return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); |
781 | } |
782 | /// Returns true if vector representation of the instruction \p I |
783 | /// requires mask. |
784 | bool isMaskRequired(const Instruction* I) { |
785 | return (MaskedOp.count(I) != 0); |
786 | } |
787 | private: |
788 | /// Check if a single basic block loop is vectorizable. |
789 | /// At this point we know that this is a loop with a constant trip count |
790 | /// and we only need to check individual instructions. |
791 | bool canVectorizeInstrs(); |
792 | |
793 | /// When we vectorize loops we may change the order in which |
794 | /// we read and write from memory. This method checks if it is |
795 | /// legal to vectorize the code, considering only memory constraints. |
796 | /// Returns true if the loop is vectorizable. |
797 | bool canVectorizeMemory(); |
798 | |
799 | /// Return true if we can vectorize this loop using the IF-conversion |
800 | /// transformation. |
801 | bool canVectorizeWithIfConvert(); |
802 | |
803 | /// Collect the variables that need to stay uniform after vectorization. |
804 | void collectLoopUniforms(); |
805 | |
806 | /// Return true if all of the instructions in the block can be speculatively |
807 | /// executed. \p SafePtrs is a list of addresses that are known to be legal |
808 | /// and we know that we can read from them without segfault. |
809 | bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); |
810 | |
811 | /// Returns true if 'Phi' is the kind of reduction variable for type |
812 | /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. |
813 | bool AddReductionVar(PHINode *Phi, ReductionKind Kind); |
814 | /// Returns a struct describing if the instruction 'I' can be a reduction |
815 | /// variable of type 'Kind'. If the reduction is a min/max pattern of |
816 | /// select(icmp()) this function advances the instruction pointer 'I' from the |
817 | /// compare instruction to the select instruction and stores this pointer in |
818 | /// 'PatternLastInst' member of the returned struct. |
819 | ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, |
820 | ReductionInstDesc &Desc); |
821 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
822 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
823 | static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, |
824 | ReductionInstDesc &Prev); |
825 | /// Returns the induction kind of Phi. This function may return NoInduction |
826 | /// if the PHI is not an induction variable. |
827 | InductionKind isInductionVariable(PHINode *Phi); |
828 | |
829 | /// \brief Collect memory accesses with loop-invariant strides. |
830 | /// |
831 | /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop |
832 | /// invariant. |
833 | void collectStridedAcccess(Value *LoadOrStoreInst); |
834 | |
835 | /// Report an analysis message to assist the user in diagnosing loops that are |
836 | /// not vectorized. |
837 | void emitAnalysis(Report &Message) { |
838 | DebugLoc DL = TheLoop->getStartLoc(); |
839 | if (Instruction *I = Message.getInstr()) |
840 | DL = I->getDebugLoc(); |
841 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, |
842 | *TheFunction, DL, Message.str()); |
843 | } |
844 | |
845 | /// The loop that we evaluate. |
846 | Loop *TheLoop; |
847 | /// Scev analysis. |
848 | ScalarEvolution *SE; |
849 | /// DataLayout analysis. |
850 | const DataLayout *DL; |
851 | /// Dominators. |
852 | DominatorTree *DT; |
853 | /// Target Library Info. |
854 | TargetLibraryInfo *TLI; |
855 | /// Alias analysis. |
856 | AliasAnalysis *AA; |
857 | /// Parent function |
858 | Function *TheFunction; |
859 | /// Target Transform Info |
860 | const TargetTransformInfo *TTI; |
861 | |
862 | // --- vectorization state --- // |
863 | |
864 | /// Holds the integer induction variable. This is the counter of the |
865 | /// loop. |
866 | PHINode *Induction; |
867 | /// Holds the reduction variables. |
868 | ReductionList Reductions; |
869 | /// Holds all of the induction variables that we found in the loop. |
870 | /// Notice that inductions don't need to start at zero and that induction |
871 | /// variables can be pointers. |
872 | InductionList Inductions; |
873 | /// Holds the widest induction type encountered. |
874 | Type *WidestIndTy; |
875 | |
876 | /// Allowed outside users. This holds the reduction |
877 | /// vars which can be accessed from outside the loop. |
878 | SmallPtrSet<Value*, 4> AllowedExit; |
879 | /// This set holds the variables which are known to be uniform after |
880 | /// vectorization. |
881 | SmallPtrSet<Instruction*, 4> Uniforms; |
882 | /// We need to check that all of the pointers in this list are disjoint |
883 | /// at runtime. |
884 | RuntimePointerCheck PtrRtCheck; |
885 | /// Can we assume the absence of NaNs. |
886 | bool HasFunNoNaNAttr; |
887 | |
888 | unsigned MaxSafeDepDistBytes; |
889 | |
890 | ValueToValueMap Strides; |
891 | SmallPtrSet<Value *, 8> StrideSet; |
892 | |
893 | /// While vectorizing these instructions we have to generate a |
894 | /// call to the appropriate masked intrinsic |
895 | SmallPtrSet<const Instruction*, 8> MaskedOp; |
896 | }; |
897 | |
898 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
899 | /// vectorization. |
900 | /// In many cases vectorization is not profitable. This can happen for a |
901 | /// number of reasons. In this class we mainly attempt to predict the |
902 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
903 | /// TargetTransformInfo to query the different backends for the cost of |
904 | /// different operations. |
905 | class LoopVectorizationCostModel { |
906 | public: |
907 | LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, |
908 | LoopVectorizationLegality *Legal, |
909 | const TargetTransformInfo &TTI, |
910 | const DataLayout *DL, const TargetLibraryInfo *TLI, |
911 | AssumptionTracker *AT, const Function *F, |
912 | const LoopVectorizeHints *Hints) |
913 | : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), |
914 | TheFunction(F), Hints(Hints) { |
915 | CodeMetrics::collectEphemeralValues(L, AT, EphValues); |
916 | } |
917 | |
918 | /// Information about vectorization costs |
919 | struct VectorizationFactor { |
920 | unsigned Width; // Vector width with best cost |
921 | unsigned Cost; // Cost of the loop with that width |
922 | }; |
923 | /// \return The most profitable vectorization factor and the cost of that VF. |
924 | /// This method checks every power of two up to VF. If UserVF is not ZERO |
925 | /// then this vectorization factor will be selected if vectorization is |
926 | /// possible. |
927 | VectorizationFactor selectVectorizationFactor(bool OptForSize); |
928 | |
929 | /// \return The size (in bits) of the widest type in the code that |
930 | /// needs to be vectorized. We ignore values that remain scalar such as |
931 | /// 64 bit loop indices. |
932 | unsigned getWidestType(); |
933 | |
934 | /// \return The most profitable unroll factor. |
935 | /// If UserUF is non-zero then this method finds the best unroll-factor |
936 | /// based on register pressure and other parameters. |
937 | /// VF and LoopCost are the selected vectorization factor and the cost of the |
938 | /// selected VF. |
939 | unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); |
940 | |
941 | /// \brief A struct that represents some properties of the register usage |
942 | /// of a loop. |
943 | struct RegisterUsage { |
944 | /// Holds the number of loop invariant values that are used in the loop. |
945 | unsigned LoopInvariantRegs; |
946 | /// Holds the maximum number of concurrent live intervals in the loop. |
947 | unsigned MaxLocalUsers; |
948 | /// Holds the number of instructions in the loop. |
949 | unsigned NumInstructions; |
950 | }; |
951 | |
952 | /// \return information about the register usage of the loop. |
953 | RegisterUsage calculateRegisterUsage(); |
954 | |
955 | private: |
956 | /// Returns the expected execution cost. The unit of the cost does |
957 | /// not matter because we use the 'cost' units to compare different |
958 | /// vector widths. The cost that is returned is *not* normalized by |
959 | /// the factor width. |
960 | unsigned expectedCost(unsigned VF); |
961 | |
962 | /// Returns the execution time cost of an instruction for a given vector |
963 | /// width. Vector width of one means scalar. |
964 | unsigned getInstructionCost(Instruction *I, unsigned VF); |
965 | |
966 | /// A helper function for converting Scalar types to vector types. |
967 | /// If the incoming type is void, we return void. If the VF is 1, we return |
968 | /// the scalar type. |
969 | static Type* ToVectorTy(Type *Scalar, unsigned VF); |
970 | |
971 | /// Returns whether the instruction is a load or store and will be emitted |
972 | /// as a vector operation. |
973 | bool isConsecutiveLoadOrStore(Instruction *I); |
974 | |
975 | /// Report an analysis message to assist the user in diagnosing loops that are |
976 | /// not vectorized. |
977 | void emitAnalysis(Report &Message) { |
978 | DebugLoc DL = TheLoop->getStartLoc(); |
979 | if (Instruction *I = Message.getInstr()) |
980 | DL = I->getDebugLoc(); |
981 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, |
982 | *TheFunction, DL, Message.str()); |
983 | } |
984 | |
985 | /// Values used only by @llvm.assume calls. |
986 | SmallPtrSet<const Value *, 32> EphValues; |
987 | |
988 | /// The loop that we evaluate. |
989 | Loop *TheLoop; |
990 | /// Scev analysis. |
991 | ScalarEvolution *SE; |
992 | /// Loop Info analysis. |
993 | LoopInfo *LI; |
994 | /// Vectorization legality. |
995 | LoopVectorizationLegality *Legal; |
996 | /// Vector target information. |
997 | const TargetTransformInfo &TTI; |
998 | /// Target data layout information. |
999 | const DataLayout *DL; |
1000 | /// Target Library Info. |
1001 | const TargetLibraryInfo *TLI; |
1002 | const Function *TheFunction; |
1003 | // Loop Vectorize Hint. |
1004 | const LoopVectorizeHints *Hints; |
1005 | }; |
1006 | |
1007 | /// Utility class for getting and setting loop vectorizer hints in the form |
1008 | /// of loop metadata. |
1009 | /// This class keeps a number of loop annotations locally (as member variables) |
1010 | /// and can, upon request, write them back as metadata on the loop. It will |
1011 | /// initially scan the loop for existing metadata, and will update the local |
1012 | /// values based on information in the loop. |
1013 | /// We cannot write all values to metadata, as the mere presence of some info, |
1014 | /// for example 'force', means a decision has been made. So, we need to be |
1015 | /// careful NOT to add them if the user hasn't specifically asked for them. |
1016 | class LoopVectorizeHints { |
1017 | enum HintKind { |
1018 | HK_WIDTH, |
1019 | HK_UNROLL, |
1020 | HK_FORCE |
1021 | }; |
1022 | |
1023 | /// Hint - associates name and validation with the hint value. |
1024 | struct Hint { |
1025 | const char * Name; |
1026 | unsigned Value; // This may have to change for non-numeric values. |
1027 | HintKind Kind; |
1028 | |
1029 | Hint(const char * Name, unsigned Value, HintKind Kind) |
1030 | : Name(Name), Value(Value), Kind(Kind) { } |
1031 | |
1032 | bool validate(unsigned Val) { |
1033 | switch (Kind) { |
1034 | case HK_WIDTH: |
1035 | return isPowerOf2_32(Val) && Val <= MaxVectorWidth; |
1036 | case HK_UNROLL: |
1037 | return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; |
1038 | case HK_FORCE: |
1039 | return (Val <= 1); |
1040 | } |
1041 | return false; |
1042 | } |
1043 | }; |
1044 | |
1045 | /// Vectorization width. |
1046 | Hint Width; |
1047 | /// Vectorization interleave factor. |
1048 | Hint Interleave; |
1049 | /// Vectorization forced |
1050 | Hint Force; |
1051 | |
1052 | /// Return the loop metadata prefix. |
1053 | static StringRef Prefix() { return "llvm.loop."; } |
1054 | |
1055 | public: |
1056 | enum ForceKind { |
1057 | FK_Undefined = -1, ///< Not selected. |
1058 | FK_Disabled = 0, ///< Forcing disabled. |
1059 | FK_Enabled = 1, ///< Forcing enabled. |
1060 | }; |
1061 | |
1062 | LoopVectorizeHints(const Loop *L, bool DisableInterleaving) |
1063 | : Width("vectorize.width", VectorizationFactor, HK_WIDTH), |
1064 | Interleave("interleave.count", DisableInterleaving, HK_UNROLL), |
1065 | Force("vectorize.enable", FK_Undefined, HK_FORCE), |
1066 | TheLoop(L) { |
1067 | // Populate values with existing loop metadata. |
1068 | getHintsFromMetadata(); |
1069 | |
1070 | // force-vector-interleave overrides DisableInterleaving. |
1071 | if (VectorizationInterleave.getNumOccurrences() > 0) |
1072 | Interleave.Value = VectorizationInterleave; |
1073 | |
1074 | DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() |
1075 | << "LV: Interleaving disabled by the pass manager\n"); |
1076 | } |
1077 | |
1078 | /// Mark the loop L as already vectorized by setting the width to 1. |
1079 | void setAlreadyVectorized() { |
1080 | Width.Value = Interleave.Value = 1; |
1081 | Hint Hints[] = {Width, Interleave}; |
1082 | writeHintsToMetadata(Hints); |
1083 | } |
1084 | |
1085 | /// Dumps all the hint information. |
1086 | std::string emitRemark() const { |
1087 | Report R; |
1088 | if (Force.Value == LoopVectorizeHints::FK_Disabled) |
1089 | R << "vectorization is explicitly disabled"; |
1090 | else { |
1091 | R << "use -Rpass-analysis=loop-vectorize for more info"; |
1092 | if (Force.Value == LoopVectorizeHints::FK_Enabled) { |
1093 | R << " (Force=true"; |
1094 | if (Width.Value != 0) |
1095 | R << ", Vector Width=" << Width.Value; |
1096 | if (Interleave.Value != 0) |
1097 | R << ", Interleave Count=" << Interleave.Value; |
1098 | R << ")"; |
1099 | } |
1100 | } |
1101 | |
1102 | return R.str(); |
1103 | } |
1104 | |
1105 | unsigned getWidth() const { return Width.Value; } |
1106 | unsigned getInterleave() const { return Interleave.Value; } |
1107 | enum ForceKind getForce() const { return (ForceKind)Force.Value; } |
1108 | |
1109 | private: |
1110 | /// Find hints specified in the loop metadata and update local values. |
1111 | void getHintsFromMetadata() { |
1112 | MDNode *LoopID = TheLoop->getLoopID(); |
1113 | if (!LoopID) |
1114 | return; |
1115 | |
1116 | // First operand should refer to the loop id itself. |
1117 | assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); |
1118 | assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); |
1119 | |
1120 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1121 | const MDString *S = nullptr; |
1122 | SmallVector<Metadata *, 4> Args; |
1123 | |
1124 | // The expected hint is either an MDString or an MDNode with the first |
1125 | // operand an MDString. |
1126 | if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { |
1127 | if (!MD || MD->getNumOperands() == 0) |
1128 | continue; |
1129 | S = dyn_cast<MDString>(MD->getOperand(0)); |
1130 | for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) |
1131 | Args.push_back(MD->getOperand(i)); |
1132 | } else { |
1133 | S = dyn_cast<MDString>(LoopID->getOperand(i)); |
1134 | assert(Args.size() == 0 && "too many arguments for MDString"); |
1135 | } |
1136 | |
1137 | if (!S) |
1138 | continue; |
1139 | |
1140 | // Check if the hint starts with the loop metadata prefix. |
1141 | StringRef Name = S->getString(); |
1142 | if (Args.size() == 1) |
1143 | setHint(Name, Args[0]); |
1144 | } |
1145 | } |
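
These hints typically originate from source-level pragmas. Assuming Clang as the frontend, the following attaches "llvm.loop.vectorize.width" and "llvm.loop.interleave.count" operands to the loop's metadata node, which the walk above then matches against the hint names:

    void scale(float *a, int n) {
    #pragma clang loop vectorize_width(4) interleave_count(2)
      for (int i = 0; i < n; ++i)
        a[i] *= 2.0f;
    }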
1146 | |
1147 | /// Checks string hint with one operand and set value if valid. |
1148 | void setHint(StringRef Name, Metadata *Arg) { |
1149 | if (!Name.startswith(Prefix())) |
1150 | return; |
1151 | Name = Name.substr(Prefix().size(), StringRef::npos); |
1152 | |
1153 | const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); |
1154 | if (!C) return; |
1155 | unsigned Val = C->getZExtValue(); |
1156 | |
1157 | Hint *Hints[] = {&Width, &Interleave, &Force}; |
1158 | for (auto H : Hints) { |
1159 | if (Name == H->Name) { |
1160 | if (H->validate(Val)) |
1161 | H->Value = Val; |
1162 | else |
1163 | DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"; } } while (0); |
1164 | break; |
1165 | } |
1166 | } |
1167 | } |
1168 | |
1169 | /// Create a new hint from name / value pair. |
1170 | MDNode *createHintMetadata(StringRef Name, unsigned V) const { |
1171 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1172 | Metadata *MDs[] = {MDString::get(Context, Name), |
1173 | ConstantAsMetadata::get( |
1174 | ConstantInt::get(Type::getInt32Ty(Context), V))}; |
1175 | return MDNode::get(Context, MDs); |
1176 | } |
1177 | |
1178 | /// Matches metadata with hint name. |
1179 | bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { |
1180 | MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); |
1181 | if (!Name) |
1182 | return false; |
1183 | |
1184 | for (auto H : HintTypes) |
1185 | if (Name->getString().endswith(H.Name)) |
1186 | return true; |
1187 | return false; |
1188 | } |
1189 | |
1190 | /// Sets current hints into loop metadata, keeping other values intact. |
1191 | void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { |
1192 | if (HintTypes.size() == 0) |
1193 | return; |
1194 | |
1195 | // Reserve the first element to LoopID (see below). |
1196 | SmallVector<Metadata *, 4> MDs(1); |
1197 | // If the loop already has metadata, then ignore the existing operands. |
1198 | MDNode *LoopID = TheLoop->getLoopID(); |
1199 | if (LoopID) { |
1200 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1201 | MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); |
1202 | // If node in update list, ignore old value. |
1203 | if (!matchesHintMetadataName(Node, HintTypes)) |
1204 | MDs.push_back(Node); |
1205 | } |
1206 | } |
1207 | |
1208 | // Now, add the missing hints. |
1209 | for (auto H : HintTypes) |
1210 | MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); |
1211 | |
1212 | // Replace current metadata node with new one. |
1213 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1214 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
1215 | // Set operand 0 to refer to the loop id itself. |
1216 | NewLoopID->replaceOperandWith(0, NewLoopID); |
1217 | |
1218 | TheLoop->setLoopID(NewLoopID); |
1219 | LoopID = NewLoopID; |
Value stored to 'LoopID' is never read | |
1220 | } |
1221 | |
1222 | /// The loop these hints belong to. |
1223 | const Loop *TheLoop; |
1224 | }; |
1225 | |
1226 | static void emitMissedWarning(Function *F, Loop *L, |
1227 | const LoopVectorizeHints &LH) { |
1228 | emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, |
1229 | L->getStartLoc(), LH.emitRemark()); |
1230 | |
1231 | if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { |
1232 | if (LH.getWidth() != 1) |
1233 | emitLoopVectorizeWarning( |
1234 | F->getContext(), *F, L->getStartLoc(), |
1235 | "failed explicitly specified loop vectorization"); |
1236 | else if (LH.getInterleave() != 1) |
1237 | emitLoopInterleaveWarning( |
1238 | F->getContext(), *F, L->getStartLoc(), |
1239 | "failed explicitly specified loop interleaving"); |
1240 | } |
1241 | } |
1242 | |
1243 | static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { |
1244 | if (L.empty()) |
1245 | return V.push_back(&L); |
1246 | |
1247 | for (Loop *InnerL : L) |
1248 | addInnerLoop(*InnerL, V); |
1249 | } |
1250 | |
1251 | /// The LoopVectorize Pass. |
1252 | struct LoopVectorize : public FunctionPass { |
1253 | /// Pass identification, replacement for typeid |
1254 | static char ID; |
1255 | |
1256 | explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) |
1257 | : FunctionPass(ID), |
1258 | DisableUnrolling(NoUnrolling), |
1259 | AlwaysVectorize(AlwaysVectorize) { |
1260 | initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
1261 | } |
1262 | |
1263 | ScalarEvolution *SE; |
1264 | const DataLayout *DL; |
1265 | LoopInfo *LI; |
1266 | TargetTransformInfo *TTI; |
1267 | DominatorTree *DT; |
1268 | BlockFrequencyInfo *BFI; |
1269 | TargetLibraryInfo *TLI; |
1270 | AliasAnalysis *AA; |
1271 | AssumptionTracker *AT; |
1272 | bool DisableUnrolling; |
1273 | bool AlwaysVectorize; |
1274 | |
1275 | BlockFrequency ColdEntryFreq; |
1276 | |
1277 | bool runOnFunction(Function &F) override { |
1278 | SE = &getAnalysis<ScalarEvolution>(); |
1279 | DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); |
1280 | DL = DLP ? &DLP->getDataLayout() : nullptr; |
1281 | LI = &getAnalysis<LoopInfo>(); |
1282 | TTI = &getAnalysis<TargetTransformInfo>(); |
1283 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
1284 | BFI = &getAnalysis<BlockFrequencyInfo>(); |
1285 | TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); |
1286 | AA = &getAnalysis<AliasAnalysis>(); |
1287 | AT = &getAnalysis<AssumptionTracker>(); |
1288 | |
1289 | // Compute some weights outside of the loop over the loops. Compute this |
1290 | // using a BranchProbability to re-use its scaling math. |
1291 | const BranchProbability ColdProb(1, 5); // 20% |
1292 | ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; |
1293 | |
1294 | // If the target claims to have no vector registers don't attempt |
1295 | // vectorization. |
1296 | if (!TTI->getNumberOfRegisters(true)) |
1297 | return false; |
1298 | |
1299 | if (!DL) { |
1300 | DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
1301 | << ": Missing data layout\n");
1302 | return false; |
1303 | } |
1304 | |
1305 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
1306 | // the act of vectorizing or partially unrolling a loop creates new loops |
1307 | // and can invalidate iterators across the loops. |
1308 | SmallVector<Loop *, 8> Worklist; |
1309 | |
1310 | for (Loop *L : *LI) |
1311 | addInnerLoop(*L, Worklist); |
1312 | |
1313 | LoopsAnalyzed += Worklist.size(); |
1314 | |
1315 | // Now walk the identified inner loops. |
1316 | bool Changed = false; |
1317 | while (!Worklist.empty()) |
1318 | Changed |= processLoop(Worklist.pop_back_val()); |
1319 | |
1320 | // Process each loop nest in the function. |
1321 | return Changed; |
1322 | } |
1323 | |
1324 | bool processLoop(Loop *L) { |
1325 | assert(L->empty() && "Only process inner loops.");
1326 | |
1327 | #ifndef NDEBUG |
1328 | const std::string DebugLocStr = getDebugLocString(L); |
1329 | #endif /* NDEBUG */ |
1330 | |
1331 | DEBUG(dbgs() << "\nLV: Checking a loop in \""
1332 | << L->getHeader()->getParent()->getName() << "\" from "
1333 | << DebugLocStr << "\n");
1334 | |
1335 | LoopVectorizeHints Hints(L, DisableUnrolling); |
1336 | |
1337 | DEBUG(dbgs() << "LV: Loop hints:"
1338 | << " force="
1339 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
1340 | ? "disabled"
1341 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
1342 | ? "enabled"
1343 | : "?")) << " width=" << Hints.getWidth()
1344 | << " unroll=" << Hints.getInterleave() << "\n");
1345 | |
1346 | // Function containing loop |
1347 | Function *F = L->getHeader()->getParent(); |
1348 | |
1349 | // Looking at the diagnostic output is the only way to determine if a loop |
1350 | // was vectorized (other than looking at the IR or machine code), so it |
1351 | // is important to generate an optimization remark for each loop. Most of |
1352 | // these messages are generated by emitOptimizationRemarkAnalysis. Remarks |
1353 | // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are |
1354 | // less verbose reporting vectorized loops and unvectorized loops that may |
1355 | // benefit from vectorization, respectively. |
1356 | |
1357 | if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { |
1358 | DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1359 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1360 | L->getStartLoc(), Hints.emitRemark()); |
1361 | return false; |
1362 | } |
1363 | |
1364 | if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { |
1365 | DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1366 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1367 | L->getStartLoc(), Hints.emitRemark()); |
1368 | return false; |
1369 | } |
1370 | |
1371 | if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { |
1372 | DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1373 | emitOptimizationRemarkAnalysis( |
1374 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1375 | "loop not vectorized: vector width and interleave count are " |
1376 | "explicitly set to 1"); |
1377 | return false; |
1378 | } |
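// For context, these hints usually originate from source-level pragmas, e.g.
// (illustrative):
//   #pragma clang loop vectorize_width(1) interleave_count(1)
// Width and interleave both equal to 1 is also the encoding that
// setAlreadyVectorized() writes back after a successful transformation.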
1379 | |
1380 | // Check the loop for a trip count threshold: |
1381 | // do not vectorize loops with a tiny trip count. |
1382 | const unsigned TC = SE->getSmallConstantTripCount(L); |
1383 | if (TC > 0u && TC < TinyTripCountVectorThreshold) { |
1384 | DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
1385 | << "This loop is not worth vectorizing.");
1386 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
1387 | DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
1388 | else { |
1389 | DEBUG(dbgs() << "\n");
1390 | emitOptimizationRemarkAnalysis( |
1391 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1392 | "vectorization is not beneficial and is not explicitly forced"); |
1393 | return false; |
1394 | } |
1395 | } |
1396 | |
1397 | // Check if it is legal to vectorize the loop. |
1398 | LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI); |
1399 | if (!LVL.canVectorize()) { |
1400 | DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
1401 | emitMissedWarning(F, L, Hints); |
1402 | return false; |
1403 | } |
1404 | |
1405 | // Use the cost model. |
1406 | LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F, |
1407 | &Hints); |
1408 | |
1409 | // Check the function attributes to find out if this function should be |
1410 | // optimized for size. |
1411 | bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1412 | F->hasFnAttribute(Attribute::OptimizeForSize); |
1413 | |
1414 | // Compute the weighted frequency of this loop being executed and see if it |
1415 | // is less than 20% of the function entry baseline frequency. Note that we |
1416 | // always have a canonical loop here because we think we *can* vectorize.
1417 | // FIXME: This is hidden behind a flag due to pervasive problems with |
1418 | // exactly what block frequency models. |
1419 | if (LoopVectorizeWithBlockFrequency) { |
1420 | BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); |
1421 | if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1422 | LoopEntryFreq < ColdEntryFreq) |
1423 | OptForSize = true; |
1424 | } |
1425 | |
1426 | // Check the function attributes to see if implicit floats are allowed.
1427 | // FIXME: This check doesn't seem possibly correct -- what if the loop is |
1428 | // an integer loop and the vector instructions selected are purely integer |
1429 | // vector instructions? |
1430 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
1431 | DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
1432 | " attribute is used.\n");
1433 | emitOptimizationRemarkAnalysis( |
1434 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1435 | "loop not vectorized due to NoImplicitFloat attribute"); |
1436 | emitMissedWarning(F, L, Hints); |
1437 | return false; |
1438 | } |
1439 | |
1440 | // Select the optimal vectorization factor. |
1441 | const LoopVectorizationCostModel::VectorizationFactor VF = |
1442 | CM.selectVectorizationFactor(OptForSize); |
1443 | |
1444 | // Select the unroll factor. |
1445 | const unsigned UF = |
1446 | CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); |
1447 | |
1448 | DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
1449 | << DebugLocStr << '\n');
1450 | DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
1451 | |
1452 | if (VF.Width == 1) { |
1453 | DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
1454 | |
1455 | if (UF == 1) { |
1456 | emitOptimizationRemarkAnalysis( |
1457 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1458 | "not beneficial to vectorize and user disabled interleaving"); |
1459 | return false; |
1460 | } |
1461 | DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
1462 | |
1463 | // Report the unrolling decision. |
1464 | emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1465 | Twine("unrolled with interleaving factor " + |
1466 | Twine(UF) + |
1467 | " (vectorization not beneficial)")); |
1468 | |
1469 | // We decided not to vectorize, but we may want to unroll. |
1470 | |
1471 | InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); |
1472 | Unroller.vectorize(&LVL); |
1473 | } else { |
1474 | // If we decided that it is *legal* to vectorize the loop then do it. |
1475 | InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); |
1476 | LB.vectorize(&LVL); |
1477 | ++LoopsVectorized; |
1478 | |
1479 | // Report the vectorization decision. |
1480 | emitOptimizationRemark( |
1481 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1482 | Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + |
1483 | ", unrolling interleave factor: " + Twine(UF) + ")"); |
1484 | } |
1485 | |
1486 | // Mark the loop as already vectorized to avoid vectorizing again. |
1487 | Hints.setAlreadyVectorized(); |
1488 | |
1489 | DEBUG(verifyFunction(*L->getHeader()->getParent()));
1490 | return true; |
1491 | } |
1492 | |
1493 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1494 | AU.addRequired<AssumptionTracker>(); |
1495 | AU.addRequiredID(LoopSimplifyID); |
1496 | AU.addRequiredID(LCSSAID); |
1497 | AU.addRequired<BlockFrequencyInfo>(); |
1498 | AU.addRequired<DominatorTreeWrapperPass>(); |
1499 | AU.addRequired<LoopInfo>(); |
1500 | AU.addRequired<ScalarEvolution>(); |
1501 | AU.addRequired<TargetTransformInfo>(); |
1502 | AU.addRequired<AliasAnalysis>(); |
1503 | AU.addPreserved<LoopInfo>(); |
1504 | AU.addPreserved<DominatorTreeWrapperPass>(); |
1505 | AU.addPreserved<AliasAnalysis>(); |
1506 | } |
1507 | |
1508 | }; |
1509 | |
1510 | } // end anonymous namespace |
1511 | |
1512 | //===----------------------------------------------------------------------===// |
1513 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
1514 | // LoopVectorizationCostModel. |
1515 | //===----------------------------------------------------------------------===// |
1516 | |
1517 | static Value *stripIntegerCast(Value *V) { |
1518 | if (CastInst *CI = dyn_cast<CastInst>(V)) |
1519 | if (CI->getOperand(0)->getType()->isIntegerTy()) |
1520 | return CI->getOperand(0); |
1521 | return V; |
1522 | } |
1523 | |
1524 | ///\brief Replaces the symbolic stride in a pointer SCEV expression by one. |
1525 | /// |
1526 | /// If \p OrigPtr is not null, use it to look up the stride value instead of |
1527 | /// \p Ptr. |
1528 | static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, |
1529 | ValueToValueMap &PtrToStride, |
1530 | Value *Ptr, Value *OrigPtr = nullptr) { |
1531 | |
1532 | const SCEV *OrigSCEV = SE->getSCEV(Ptr); |
1533 | |
1534 | // If there is an entry in the map return the SCEV of the pointer with the |
1535 | // symbolic stride replaced by one. |
1536 | ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); |
1537 | if (SI != PtrToStride.end()) { |
1538 | Value *StrideVal = SI->second; |
1539 | |
1540 | // Strip casts. |
1541 | StrideVal = stripIntegerCast(StrideVal); |
1542 | |
1543 | // Replace symbolic stride by one. |
1544 | Value *One = ConstantInt::get(StrideVal->getType(), 1); |
1545 | ValueToValueMap RewriteMap; |
1546 | RewriteMap[StrideVal] = One; |
1547 | |
1548 | const SCEV *ByOne = |
1549 | SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); |
1550 | DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
1551 | << "\n");
1552 | return ByOne; |
1553 | } |
1554 | |
1555 | // Otherwise, just return the SCEV of the original pointer. |
1556 | return SE->getSCEV(Ptr); |
1557 | } |
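// Illustrative example (shapes assumed): for a pointer whose SCEV is
//   {%A,+,(4 * %Stride)}<%loop>
// and a stride map entry for %Stride, the rewrite above yields
//   {%A,+,4}<%loop>
// i.e. a unit-stride recurrence that the consecutive-access checks below can
// reason about once a runtime guard proves %Stride == 1.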
1558 | |
1559 | void LoopVectorizationLegality::RuntimePointerCheck::insert( |
1560 | ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, |
1561 | unsigned ASId, ValueToValueMap &Strides) { |
1562 | // Get the stride replaced scev. |
1563 | const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
1564 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); |
1565 | assert(AR && "Invalid addrec expression");
1566 | const SCEV *Ex = SE->getBackedgeTakenCount(Lp); |
1567 | const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); |
1568 | Pointers.push_back(Ptr); |
1569 | Starts.push_back(AR->getStart()); |
1570 | Ends.push_back(ScEnd); |
1571 | IsWritePtr.push_back(WritePtr); |
1572 | DependencySetId.push_back(DepSetId); |
1573 | AliasSetId.push_back(ASId); |
1574 | } |
1575 | |
1576 | Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
1577 | // We need to place the broadcast of invariant variables outside the loop. |
1578 | Instruction *Instr = dyn_cast<Instruction>(V); |
1579 | bool NewInstr = |
1580 | (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), |
1581 | Instr->getParent()) != LoopVectorBody.end()); |
1582 | bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; |
1583 | |
1584 | // Place the code for broadcasting invariant variables in the new preheader. |
1585 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1586 | if (Invariant) |
1587 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
1588 | |
1589 | // Broadcast the scalar into all locations in the vector. |
1590 | Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
1591 | |
1592 | return Shuf; |
1593 | } |
1594 | |
1595 | Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, |
1596 | bool Negate) { |
1597 | assert(Val->getType()->isVectorTy() && "Must be a vector");
1598 | assert(Val->getType()->getScalarType()->isIntegerTy() &&
1599 | "Elem must be an integer");
1600 | // Create the types. |
1601 | Type *ITy = Val->getType()->getScalarType(); |
1602 | VectorType *Ty = cast<VectorType>(Val->getType()); |
1603 | int VLen = Ty->getNumElements(); |
1604 | SmallVector<Constant*, 8> Indices; |
1605 | |
1606 | // Create a vector of consecutive numbers from zero to VF. |
1607 | for (int i = 0; i < VLen; ++i) { |
1608 | int64_t Idx = Negate ? (-i) : i; |
1609 | Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); |
1610 | } |
1611 | |
1612 | // Add the consecutive indices to the vector value. |
1613 | Constant *Cv = ConstantVector::get(Indices); |
1614 | assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1615 | return Builder.CreateAdd(Val, Cv, "induction"); |
1616 | } |
1617 | |
1618 | /// \brief Find the operand of the GEP that should be checked for consecutive |
1619 | /// stores. This ignores trailing indices that have no effect on the final |
1620 | /// pointer. |
1621 | static unsigned getGEPInductionOperand(const DataLayout *DL, |
1622 | const GetElementPtrInst *Gep) { |
1623 | unsigned LastOperand = Gep->getNumOperands() - 1; |
1624 | unsigned GEPAllocSize = DL->getTypeAllocSize( |
1625 | cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); |
1626 | |
1627 | // Walk backwards and try to peel off zeros. |
1628 | while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { |
1629 | // Find the type we're currently indexing into. |
1630 | gep_type_iterator GEPTI = gep_type_begin(Gep); |
1631 | std::advance(GEPTI, LastOperand - 1); |
1632 | |
1633 | // If it's a type with the same allocation size as the result of the GEP we |
1634 | // can peel off the zero index. |
1635 | if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) |
1636 | break; |
1637 | --LastOperand; |
1638 | } |
1639 | |
1640 | return LastOperand; |
1641 | } |
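// Hedged example: for a GEP such as
//   %p = getelementptr inbounds [1 x i32]* %A, i64 %iv, i64 0
// the trailing zero index does not move the final pointer (the indexed type
// has the same allocation size as the GEP's result element type), so the
// walk above peels it off and returns the operand position holding %iv.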
1642 | |
1643 | int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { |
1644 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1645 | // Make sure that the pointer does not point to structs. |
1646 | if (Ptr->getType()->getPointerElementType()->isAggregateType()) |
1647 | return 0; |
1648 | |
1649 | // If this value is a pointer induction variable we know it is consecutive. |
1650 | PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); |
1651 | if (Phi && Inductions.count(Phi)) { |
1652 | InductionInfo II = Inductions[Phi]; |
1653 | if (IK_PtrInduction == II.IK) |
1654 | return 1; |
1655 | else if (IK_ReversePtrInduction == II.IK) |
1656 | return -1; |
1657 | } |
1658 | |
1659 | GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); |
1660 | if (!Gep) |
1661 | return 0; |
1662 | |
1663 | unsigned NumOperands = Gep->getNumOperands(); |
1664 | Value *GpPtr = Gep->getPointerOperand(); |
1665 | // If this GEP value is a consecutive pointer induction variable and all of |
1666 | // the indices are constant then we know it is consecutive.
1667 | Phi = dyn_cast<PHINode>(GpPtr); |
1668 | if (Phi && Inductions.count(Phi)) { |
1669 | |
1670 | // Make sure that the pointer does not point to structs. |
1671 | PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); |
1672 | if (GepPtrType->getElementType()->isAggregateType()) |
1673 | return 0; |
1674 | |
1675 | // Make sure that all of the index operands are loop invariant. |
1676 | for (unsigned i = 1; i < NumOperands; ++i) |
1677 | if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1678 | return 0; |
1679 | |
1680 | InductionInfo II = Inductions[Phi]; |
1681 | if (IK_PtrInduction == II.IK) |
1682 | return 1; |
1683 | else if (IK_ReversePtrInduction == II.IK) |
1684 | return -1; |
1685 | } |
1686 | |
1687 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1688 | |
1689 | // Check that all of the gep indices are uniform except for our induction |
1690 | // operand. |
1691 | for (unsigned i = 0; i != NumOperands; ++i) |
1692 | if (i != InductionOperand && |
1693 | !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1694 | return 0; |
1695 | |
1696 | // We can emit wide load/stores only if the last non-zero index is the |
1697 | // induction variable. |
1698 | const SCEV *Last = nullptr; |
1699 | if (!Strides.count(Gep)) |
1700 | Last = SE->getSCEV(Gep->getOperand(InductionOperand)); |
1701 | else { |
1702 | // Because of the multiplication by a stride we can have a s/zext cast. |
1703 | // We are going to replace this stride by 1 so the cast is safe to ignore. |
1704 | // |
1705 | // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] |
1706 | // %0 = trunc i64 %indvars.iv to i32 |
1707 | // %mul = mul i32 %0, %Stride1 |
1708 | // %idxprom = zext i32 %mul to i64 << Safe cast. |
1709 | // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom |
1710 | // |
1711 | Last = replaceSymbolicStrideSCEV(SE, Strides, |
1712 | Gep->getOperand(InductionOperand), Gep); |
1713 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) |
1714 | Last = |
1715 | (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) |
1716 | ? C->getOperand() |
1717 | : Last; |
1718 | } |
1719 | if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { |
1720 | const SCEV *Step = AR->getStepRecurrence(*SE); |
1721 | |
1722 | // The memory is consecutive because the last index is consecutive |
1723 | // and all other indices are loop invariant. |
1724 | if (Step->isOne()) |
1725 | return 1; |
1726 | if (Step->isAllOnesValue()) |
1727 | return -1; |
1728 | } |
1729 | |
1730 | return 0; |
1731 | } |
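// Hedged summary by example, with i the loop induction variable:
//   A[i]   -> 1  (consecutive)
//   A[N-i] -> -1 (reverse consecutive)
//   A[2*i] -> 0  (strided; only handled when the symbolic-stride machinery
//                 can later prove the stride to be one at runtime)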
1732 | |
1733 | bool LoopVectorizationLegality::isUniform(Value *V) { |
1734 | return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); |
1735 | } |
1736 | |
1737 | InnerLoopVectorizer::VectorParts& |
1738 | InnerLoopVectorizer::getVectorValue(Value *V) { |
1739 | assert(V != Induction && "The new induction variable should not be used.");
1740 | assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1741 | |
1742 | // If we have a stride that is replaced by one, do it here. |
1743 | if (Legal->hasStride(V)) |
1744 | V = ConstantInt::get(V->getType(), 1); |
1745 | |
1746 | // If we have this scalar in the map, return it. |
1747 | if (WidenMap.has(V)) |
1748 | return WidenMap.get(V); |
1749 | |
1750 | // If this scalar is unknown, assume that it is a constant or that it is |
1751 | // loop invariant. Broadcast V and save the value for future uses. |
1752 | Value *B = getBroadcastInstrs(V); |
1753 | return WidenMap.splat(V, B); |
1754 | } |
1755 | |
1756 | Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
1757 | assert(Vec->getType()->isVectorTy() && "Invalid type");
1758 | SmallVector<Constant*, 8> ShuffleMask; |
1759 | for (unsigned i = 0; i < VF; ++i) |
1760 | ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); |
1761 | |
1762 | return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), |
1763 | ConstantVector::get(ShuffleMask), |
1764 | "reverse"); |
1765 | } |
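// Example of the mask built above, assuming VF == 4:
//   shufflevector <4 x i32> %vec, <4 x i32> undef,
//                 <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// which yields the input vector with its lanes in reverse order.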
1766 | |
1767 | void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { |
1768 | // Attempt to issue a wide load. |
1769 | LoadInst *LI = dyn_cast<LoadInst>(Instr); |
1770 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
1771 | |
1772 | assert((LI || SI) && "Invalid Load/Store instruction");
1773 | |
1774 | Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); |
1775 | Type *DataTy = VectorType::get(ScalarDataTy, VF); |
1776 | Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); |
1777 | unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); |
1778 | // An alignment of 0 means target abi alignment. We need to use the scalar's |
1779 | // target abi alignment in such a case. |
1780 | if (!Alignment) |
1781 | Alignment = DL->getABITypeAlignment(ScalarDataTy); |
1782 | unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); |
1783 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); |
1784 | unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; |
1785 | |
1786 | if (SI && Legal->blockNeedsPredication(SI->getParent()) && |
1787 | !Legal->isMaskRequired(SI)) |
1788 | return scalarizeInstruction(Instr, true); |
1789 | |
1790 | if (ScalarAllocatedSize != VectorElementSize) |
1791 | return scalarizeInstruction(Instr); |
1792 | |
1793 | // If the pointer is loop invariant or if it is non-consecutive, |
1794 | // scalarize the load. |
1795 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
1796 | bool Reverse = ConsecutiveStride < 0; |
1797 | bool UniformLoad = LI && Legal->isUniform(Ptr); |
1798 | if (!ConsecutiveStride || UniformLoad) |
1799 | return scalarizeInstruction(Instr); |
1800 | |
1801 | Constant *Zero = Builder.getInt32(0); |
1802 | VectorParts &Entry = WidenMap.get(Instr); |
1803 | |
1804 | // Handle consecutive loads/stores. |
1805 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
1806 | if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { |
1807 | setDebugLocFromInst(Builder, Gep); |
1808 | Value *PtrOperand = Gep->getPointerOperand(); |
1809 | Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; |
1810 | FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); |
1811 | |
1812 | // Create the new GEP with the new induction variable. |
1813 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1814 | Gep2->setOperand(0, FirstBasePtr); |
1815 | Gep2->setName("gep.indvar.base"); |
1816 | Ptr = Builder.Insert(Gep2); |
1817 | } else if (Gep) { |
1818 | setDebugLocFromInst(Builder, Gep); |
1819 | assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
1820 | OrigLoop) && "Base ptr must be invariant");
1821 | |
1822 | // The last index does not have to be the induction. It can be |
1823 | // consecutive and be a function of the index. For example A[I+1]; |
1824 | unsigned NumOperands = Gep->getNumOperands(); |
1825 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1826 | // Create the new GEP with the new induction variable. |
1827 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1828 | |
1829 | for (unsigned i = 0; i < NumOperands; ++i) { |
1830 | Value *GepOperand = Gep->getOperand(i); |
1831 | Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); |
1832 | |
1833 | // Update last index or loop invariant instruction anchored in loop. |
1834 | if (i == InductionOperand || |
1835 | (GepOperandInst && OrigLoop->contains(GepOperandInst))) { |
1836 | assert((i == InductionOperand ||
1837 | SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
1838 | "Must be last index or loop invariant");
1839 | |
1840 | VectorParts &GEPParts = getVectorValue(GepOperand); |
1841 | Value *Index = GEPParts[0]; |
1842 | Index = Builder.CreateExtractElement(Index, Zero); |
1843 | Gep2->setOperand(i, Index); |
1844 | Gep2->setName("gep.indvar.idx"); |
1845 | } |
1846 | } |
1847 | Ptr = Builder.Insert(Gep2); |
1848 | } else { |
1849 | // Use the induction element ptr. |
1850 | assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
1851 | setDebugLocFromInst(Builder, Ptr); |
1852 | VectorParts &PtrVal = getVectorValue(Ptr); |
1853 | Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); |
1854 | } |
1855 | |
1856 | // Handle Stores: |
1857 | if (SI) { |
1858 | assert(!Legal->isUniform(SI->getPointerOperand()) &&
1859 | "We do not allow storing to uniform addresses");
1860 | setDebugLocFromInst(Builder, SI); |
1861 | // We don't want to update the value in the map as it might be used in |
1862 | // another expression. So don't use a reference type for "StoredVal". |
1863 | VectorParts StoredVal = getVectorValue(SI->getValueOperand()); |
1864 | |
1865 | for (unsigned Part = 0; Part < UF; ++Part) { |
1866 | // Calculate the pointer for the specific unroll-part. |
1867 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1868 | |
1869 | if (Reverse) { |
1870 | // If we store to reverse consecutive memory locations then we need |
1871 | // to reverse the order of elements in the stored value. |
1872 | StoredVal[Part] = reverseVector(StoredVal[Part]); |
1873 | // If the address is consecutive but reversed, then the |
1874 | // wide store needs to start at the last vector element. |
1875 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1876 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1877 | } |
1878 | |
1879 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1880 | DataTy->getPointerTo(AddressSpace)); |
1881 | |
1882 | Instruction *NewSI; |
1883 | if (Legal->isMaskRequired(SI)) { |
1884 | Type *I8PtrTy = |
1885 | Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace()); |
1886 | |
1887 | Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy); |
1888 | |
1889 | VectorParts Cond = createBlockInMask(SI->getParent()); |
1890 | SmallVector <Value *, 8> Ops; |
1891 | Ops.push_back(I8Ptr); |
1892 | Ops.push_back(StoredVal[Part]); |
1893 | Ops.push_back(Builder.getInt32(Alignment)); |
1894 | Ops.push_back(Cond[Part]); |
1895 | NewSI = Builder.CreateMaskedStore(Ops); |
1896 | } |
1897 | else |
1898 | NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); |
1899 | propagateMetadata(NewSI, SI); |
1900 | } |
1901 | return; |
1902 | } |
1903 | |
1904 | // Handle loads. |
1905 | assert(LI && "Must have a load instruction");
1906 | setDebugLocFromInst(Builder, LI); |
1907 | for (unsigned Part = 0; Part < UF; ++Part) { |
1908 | // Calculate the pointer for the specific unroll-part. |
1909 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1910 | |
1911 | if (Reverse) { |
1912 | // If the address is consecutive but reversed, then the |
1913 | // wide load needs to start at the last vector element. |
1914 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1915 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1916 | } |
1917 | |
1918 | Instruction* NewLI; |
1919 | if (Legal->isMaskRequired(LI)) { |
1920 | Type *I8PtrTy = |
1921 | Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace()); |
1922 | |
1923 | Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy); |
1924 | |
1925 | VectorParts SrcMask = createBlockInMask(LI->getParent()); |
1926 | SmallVector <Value *, 8> Ops; |
1927 | Ops.push_back(I8Ptr); |
1928 | Ops.push_back(UndefValue::get(DataTy)); |
1929 | Ops.push_back(Builder.getInt32(Alignment)); |
1930 | Ops.push_back(SrcMask[Part]); |
1931 | NewLI = Builder.CreateMaskedLoad(Ops); |
1932 | } |
1933 | else { |
1934 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1935 | DataTy->getPointerTo(AddressSpace)); |
1936 | NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); |
1937 | } |
1938 | propagateMetadata(NewLI, LI); |
1939 | Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; |
1940 | } |
1941 | } |
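// Worked example of the reverse-access arithmetic above, assuming VF == 4:
// for Part == 0 the two GEPs apply offsets 0 and 1 - VF == -3, so the wide
// access covers Ptr[-3..0]; for Part == 1 the offsets are -4 and -3, covering
// Ptr[-7..-4]. reverseVector() then restores the original lane order.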
1942 | |
1943 | void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { |
1944 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
1945 | // Holds vector parameters or scalars, in case of uniform vals. |
1946 | SmallVector<VectorParts, 4> Params; |
1947 | |
1948 | setDebugLocFromInst(Builder, Instr); |
1949 | |
1950 | // Find all of the vectorized parameters. |
1951 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
1952 | Value *SrcOp = Instr->getOperand(op); |
1953 | |
1954 | // If we are accessing the old induction variable, use the new one. |
1955 | if (SrcOp == OldInduction) { |
1956 | Params.push_back(getVectorValue(SrcOp)); |
1957 | continue; |
1958 | } |
1959 | |
1960 | // Try using previously calculated values. |
1961 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
1962 | |
1963 | // If the src is an instruction that appeared earlier in the basic block |
1964 | // then it should already be vectorized. |
1965 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
1966 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
1967 | // The parameter is a vector value from earlier. |
1968 | Params.push_back(WidenMap.get(SrcInst)); |
1969 | } else { |
1970 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
1971 | VectorParts Scalars; |
1972 | Scalars.append(UF, SrcOp); |
1973 | Params.push_back(Scalars); |
1974 | } |
1975 | } |
1976 | |
1977 | assert(Params.size() == Instr->getNumOperands() &&
1978 | "Invalid number of operands");
1979 | |
1980 | // Does this instruction return a value ? |
1981 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
1982 | |
1983 | Value *UndefVec = IsVoidRetTy ? nullptr : |
1984 | UndefValue::get(VectorType::get(Instr->getType(), VF)); |
1985 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
1986 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
1987 | |
1988 | Instruction *InsertPt = Builder.GetInsertPoint(); |
1989 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
1990 | BasicBlock *CondBlock = nullptr; |
1991 | |
1992 | VectorParts Cond; |
1993 | Loop *VectorLp = nullptr; |
1994 | if (IfPredicateStore) { |
1995 | assert(Instr->getParent()->getSinglePredecessor() &&
1996 | "Only support single predecessor blocks");
1997 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
1998 | Instr->getParent()); |
1999 | VectorLp = LI->getLoopFor(IfBlock); |
2000 | assert(VectorLp && "Must have a loop for this block");
2001 | } |
2002 | |
2003 | // For each vector unroll 'part': |
2004 | for (unsigned Part = 0; Part < UF; ++Part) { |
2005 | // For each scalar that we create: |
2006 | for (unsigned Width = 0; Width < VF; ++Width) { |
2007 | |
2008 | // Start if-block. |
2009 | Value *Cmp = nullptr; |
2010 | if (IfPredicateStore) { |
2011 | Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); |
2012 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); |
2013 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
2014 | LoopVectorBody.push_back(CondBlock); |
2015 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
2016 | // Update Builder with newly created basic block. |
2017 | Builder.SetInsertPoint(InsertPt); |
2018 | } |
2019 | |
2020 | Instruction *Cloned = Instr->clone(); |
2021 | if (!IsVoidRetTy) |
2022 | Cloned->setName(Instr->getName() + ".cloned"); |
2023 | // Replace the operands of the cloned instructions with extracted scalars. |
2024 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
2025 | Value *Op = Params[op][Part]; |
2026 | // Param is a vector. Need to extract the right lane. |
2027 | if (Op->getType()->isVectorTy()) |
2028 | Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); |
2029 | Cloned->setOperand(op, Op); |
2030 | } |
2031 | |
2032 | // Place the cloned scalar in the new loop. |
2033 | Builder.Insert(Cloned); |
2034 | |
2035 | // If the original scalar returns a value we need to place it in a vector |
2036 | // so that future users will be able to use it. |
2037 | if (!IsVoidRetTy) |
2038 | VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, |
2039 | Builder.getInt32(Width)); |
2040 | // End if-block. |
2041 | if (IfPredicateStore) { |
2042 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
2043 | LoopVectorBody.push_back(NewIfBlock); |
2044 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
2045 | Builder.SetInsertPoint(InsertPt); |
2046 | Instruction *OldBr = IfBlock->getTerminator(); |
2047 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
2048 | OldBr->eraseFromParent(); |
2049 | IfBlock = NewIfBlock; |
2050 | } |
2051 | } |
2052 | } |
2053 | } |
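// Sketch of the per-lane expansion above for a non-void Instr, VF == 2 and
// UF == 1 (all names illustrative):
//   %op0   = extractelement <2 x i32> %param.vec, i32 0
//   %c0    = udiv i32 %op0, %op1.lane0          ; Instr->clone() for lane 0
//   %res.0 = insertelement <2 x i32> undef, i32 %c0, i32 0
// and likewise for lane 1, with a cond.store/else diamond wrapped around
// each clone when IfPredicateStore is set.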
2054 | |
2055 | static Instruction *getFirstInst(Instruction *FirstInst, Value *V, |
2056 | Instruction *Loc) { |
2057 | if (FirstInst) |
2058 | return FirstInst; |
2059 | if (Instruction *I = dyn_cast<Instruction>(V)) |
2060 | return I->getParent() == Loc->getParent() ? I : nullptr; |
2061 | return nullptr; |
2062 | } |
2063 | |
2064 | std::pair<Instruction *, Instruction *> |
2065 | InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { |
2066 | Instruction *tnullptr = nullptr; |
2067 | if (!Legal->mustCheckStrides()) |
2068 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2069 | |
2070 | IRBuilder<> ChkBuilder(Loc); |
2071 | |
2072 | // Emit checks. |
2073 | Value *Check = nullptr; |
2074 | Instruction *FirstInst = nullptr; |
2075 | for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), |
2076 | SE = Legal->strides_end(); |
2077 | SI != SE; ++SI) { |
2078 | Value *Ptr = stripIntegerCast(*SI); |
2079 | Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), |
2080 | "stride.chk"); |
2081 | // Store the first instruction we create. |
2082 | FirstInst = getFirstInst(FirstInst, C, Loc); |
2083 | if (Check) |
2084 | Check = ChkBuilder.CreateOr(Check, C); |
2085 | else |
2086 | Check = C; |
2087 | } |
2088 | |
2089 | // We have to do this trickery because the IRBuilder might fold the check to a |
2090 | // constant expression, in which case there is no Instruction anchored in
2091 | // the block. |
2092 | LLVMContext &Ctx = Loc->getContext(); |
2093 | Instruction *TheCheck = |
2094 | BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); |
2095 | ChkBuilder.Insert(TheCheck, "stride.not.one"); |
2096 | FirstInst = getFirstInst(FirstInst, TheCheck, Loc); |
2097 | |
2098 | return std::make_pair(FirstInst, TheCheck); |
2099 | } |
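// Illustrative IR for one symbolic stride (value name assumed):
//   %stride.chk     = icmp ne i64 %Stride, 1
//   %stride.not.one = and i1 %stride.chk, true
// Several strides are OR-ed together first; the trailing AND with true exists
// only to guarantee an Instruction is anchored in the block.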
2100 | |
2101 | std::pair<Instruction *, Instruction *> |
2102 | InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { |
2103 | LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = |
2104 | Legal->getRuntimePointerCheck(); |
2105 | |
2106 | Instruction *tnullptr = nullptr; |
2107 | if (!PtrRtCheck->Need) |
2108 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2109 | |
2110 | unsigned NumPointers = PtrRtCheck->Pointers.size(); |
2111 | SmallVector<TrackingVH<Value> , 2> Starts; |
2112 | SmallVector<TrackingVH<Value> , 2> Ends; |
2113 | |
2114 | LLVMContext &Ctx = Loc->getContext(); |
2115 | SCEVExpander Exp(*SE, "induction"); |
2116 | Instruction *FirstInst = nullptr; |
2117 | |
2118 | for (unsigned i = 0; i < NumPointers; ++i) { |
2119 | Value *Ptr = PtrRtCheck->Pointers[i]; |
2120 | const SCEV *Sc = SE->getSCEV(Ptr); |
2121 | |
2122 | if (SE->isLoopInvariant(Sc, OrigLoop)) { |
2123 | DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
2124 | *Ptr << "\n");
2125 | Starts.push_back(Ptr); |
2126 | Ends.push_back(Ptr); |
2127 | } else { |
2128 | DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
2129 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
2130 | |
2131 | // Use this type for pointer arithmetic. |
2132 | Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); |
2133 | |
2134 | Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); |
2135 | Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); |
2136 | Starts.push_back(Start); |
2137 | Ends.push_back(End); |
2138 | } |
2139 | } |
2140 | |
2141 | IRBuilder<> ChkBuilder(Loc); |
2142 | // Our instructions might fold to a constant. |
2143 | Value *MemoryRuntimeCheck = nullptr; |
2144 | for (unsigned i = 0; i < NumPointers; ++i) { |
2145 | for (unsigned j = i+1; j < NumPointers; ++j) { |
2146 | // No need to check if two readonly pointers intersect. |
2147 | if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) |
2148 | continue; |
2149 | |
2150 | // Only need to check pointers between two different dependency sets. |
2151 | if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) |
2152 | continue; |
2153 | // Only need to check pointers in the same alias set. |
2154 | if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) |
2155 | continue; |
2156 | |
2157 | unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); |
2158 | unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); |
2159 | |
2160 | assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
2161 | (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
2162 | "Trying to bounds check pointers with different address spaces");
2163 | |
2164 | Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); |
2165 | Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); |
2166 | |
2167 | Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); |
2168 | Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); |
2169 | Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); |
2170 | Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); |
2171 | |
2172 | Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); |
2173 | FirstInst = getFirstInst(FirstInst, Cmp0, Loc); |
2174 | Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); |
2175 | FirstInst = getFirstInst(FirstInst, Cmp1, Loc); |
2176 | Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); |
2177 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2178 | if (MemoryRuntimeCheck) { |
2179 | IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, |
2180 | "conflict.rdx"); |
2181 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2182 | } |
2183 | MemoryRuntimeCheck = IsConflict; |
2184 | } |
2185 | } |
2186 | |
2187 | // We have to do this trickery because the IRBuilder might fold the check to a
2188 | // constant expression in which case there is no Instruction anchored in
2189 | // the block.
2190 | Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, |
2191 | ConstantInt::getTrue(Ctx)); |
2192 | ChkBuilder.Insert(Check, "memcheck.conflict"); |
2193 | FirstInst = getFirstInst(FirstInst, Check, Loc); |
2194 | return std::make_pair(FirstInst, Check); |
2195 | } |
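
// For illustration, a minimal scalar model of the pairwise overlap test
// emitted above, assuming two byte ranges bounded by the expanded Start/End
// SCEVs (the helper name is hypothetical, not part of this file):
//
//   static bool rangesMayConflict(const char *Start0, const char *End0,
//                                 const char *Start1, const char *End1) {
//     // Mirrors the two ULE compares: the ranges overlap iff each start
//     // is not above the other range's end.
//     return Start0 <= End1 && Start1 <= End0;
//   }
//
// The per-pair results are OR'ed into MemoryRuntimeCheck, so any single
// conflicting pair sends execution to the scalar loop.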
2196 | |
2197 | void InnerLoopVectorizer::createEmptyLoop() { |
2198 | /* |
2199 | In this function we generate a new loop. The new loop will contain |
2200 | the vectorized instructions while the old loop will continue to run the |
2201 | scalar remainder. |
2202 | |
2203 | [ ] <-- Back-edge taken count overflow check. |
2204 | / | |
2205 | / v |
2206 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2207 | | / | |
2208 | | / v |
2209 | || [ ] <-- vector pre header. |
2210 | || | |
2211 | || v |
2212 | || [ ] \ |
2213 | || [ ]_| <-- vector loop. |
2214 | || | |
2215 | | \ v |
2216 | | >[ ] <--- middle-block. |
2217 | | / | |
2218 | | / v |
2219 | -|- >[ ] <--- new preheader. |
2220 | | | |
2221 | | v |
2222 | | [ ] \ |
2223 | | [ ]_| <-- old scalar loop to handle remainder. |
2224 | \ | |
2225 | \ v |
2226 | >[ ] <-- exit block. |
2227 | ... |
2228 | */ |
2229 | |
2230 | BasicBlock *OldBasicBlock = OrigLoop->getHeader(); |
2231 | BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); |
2232 | BasicBlock *ExitBlock = OrigLoop->getExitBlock(); |
2233 | assert(BypassBlock && "Invalid loop structure");
2234 | assert(ExitBlock && "Must have an exit block");
2235 | |
2236 | // Some loops have a single integer induction variable, while other loops
2237 | // don't. One example is C++ iterator loops, which often have multiple
2238 | // pointer induction variables. The code below also supports the case
2239 | // where we don't have a single induction variable.
2240 | OldInduction = Legal->getInduction(); |
2241 | Type *IdxTy = Legal->getWidestInductionType(); |
2242 | |
2243 | // Find the loop boundaries. |
2244 | const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); |
2245 | assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
2246 | |
2247 | // The exit count might have type i64 while the phi is i32. This can
2248 | // happen if we have an induction variable that is sign-extended before the
2249 | // compare. The only way we can get a backedge-taken count in that case is
2250 | // if the induction variable was signed, and a signed induction variable
2251 | // will not overflow. Truncation is therefore legal.
2252 | if (ExitCount->getType()->getPrimitiveSizeInBits() > |
2253 | IdxTy->getPrimitiveSizeInBits()) |
2254 | ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); |
2255 | |
2256 | const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); |
2257 | // Get the total trip count from the count by adding 1. |
2258 | ExitCount = SE->getAddExpr(BackedgeTakeCount, |
2259 | SE->getConstant(BackedgeTakeCount->getType(), 1)); |
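
// A worked example, assuming a simple counted loop (illustrative only):
//
//   for (int i = 0; i < n; ++i)   // backedge-taken count == n - 1
//     ...
//
// ExitCount then becomes (n - 1) + 1 == n, the total trip count.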
2260 | |
2261 | // Expand the trip count and place the new instructions in the preheader. |
2262 | // Notice that the pre-header does not change, only the loop body. |
2263 | SCEVExpander Exp(*SE, "induction"); |
2264 | |
2265 | // We need to test whether the backedge-taken count is uint##_max. Adding one |
2266 | // to it will cause overflow and an incorrect loop trip count in the vector |
2267 | // body. In case of overflow we want to directly jump to the scalar remainder |
2268 | // loop. |
2269 | Value *BackedgeCount = |
2270 | Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), |
2271 | BypassBlock->getTerminator()); |
2272 | if (BackedgeCount->getType()->isPointerTy()) |
2273 | BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, |
2274 | "backedge.ptrcnt.to.int", |
2275 | BypassBlock->getTerminator()); |
2276 | Instruction *CheckBCOverflow = |
2277 | CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, |
2278 | Constant::getAllOnesValue(BackedgeCount->getType()), |
2279 | "backedge.overflow", BypassBlock->getTerminator()); |
2280 | |
2281 | // The loop index does not have to start at zero. Find the original start
2282 | // value from the induction PHI node. If we don't have an induction variable |
2283 | // then we know that it starts at zero. |
2284 | Builder.SetInsertPoint(BypassBlock->getTerminator()); |
2285 | Value *StartIdx = ExtendedIdx = OldInduction ? |
2286 | Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), |
2287 | IdxTy): |
2288 | ConstantInt::get(IdxTy, 0); |
2289 | |
2290 | // We need an instruction to anchor the overflow check on. StartIdx needs to
2291 | // be defined before the overflow check branch: the scalar preheader is going
2292 | // to merge the start index, so the overflow branch block needs to contain a
2293 | // definition of the start index.
2294 | Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( |
2295 | StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", |
2296 | BypassBlock->getTerminator()); |
2297 | |
2298 | // Count holds the overall loop count (N). |
2299 | Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
2300 | BypassBlock->getTerminator()); |
2301 | |
2302 | LoopBypassBlocks.push_back(BypassBlock); |
2303 | |
2304 | // Split the single block loop into the two loop structure described above. |
2305 | BasicBlock *VectorPH = |
2306 | BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); |
2307 | BasicBlock *VecBody = |
2308 | VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); |
2309 | BasicBlock *MiddleBlock = |
2310 | VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); |
2311 | BasicBlock *ScalarPH = |
2312 | MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); |
2313 | |
2314 | // Create and register the new vector loop. |
2315 | Loop* Lp = new Loop(); |
2316 | Loop *ParentLoop = OrigLoop->getParentLoop(); |
2317 | |
2318 | // Insert the new loop into the loop nest and register the new basic blocks |
2319 | // before calling any utilities such as SCEV that require valid LoopInfo. |
2320 | if (ParentLoop) { |
2321 | ParentLoop->addChildLoop(Lp); |
2322 | ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); |
2323 | ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); |
2324 | ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); |
2325 | } else { |
2326 | LI->addTopLevelLoop(Lp); |
2327 | } |
2328 | Lp->addBasicBlockToLoop(VecBody, LI->getBase()); |
2329 | |
2330 | // Use this IR builder to create the loop instructions (Phi, Br, Cmp) |
2331 | // inside the loop. |
2332 | Builder.SetInsertPoint(VecBody->getFirstNonPHI()); |
2333 | |
2334 | // Generate the induction variable. |
2335 | setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); |
2336 | Induction = Builder.CreatePHI(IdxTy, 2, "index"); |
2337 | // The loop step is equal to the vectorization factor (num of SIMD elements) |
2338 | // times the unroll factor (num of SIMD instructions). |
2339 | Constant *Step = ConstantInt::get(IdxTy, VF * UF); |
2340 | |
2341 | // This is the IR builder that we use to add all of the logic for bypassing |
2342 | // the new vector loop. |
2343 | IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); |
2344 | setDebugLocFromInst(BypassBuilder, |
2345 | getDebugLocFromInstOrOperands(OldInduction)); |
2346 | |
2347 | // We may need to extend the index in case there is a type mismatch. |
2348 | // We know that the count starts at zero and does not overflow. |
2349 | if (Count->getType() != IdxTy) { |
2350 | // The exit count can be of pointer type. Convert it to the correct |
2351 | // integer type. |
2352 | if (ExitCount->getType()->isPointerTy()) |
2353 | Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); |
2354 | else |
2355 | Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); |
2356 | } |
2357 | |
2358 | // Add the start index to the loop count to get the new end index. |
2359 | Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); |
2360 | |
2361 | // Now we need to generate the expression for N - (N % VF), which is |
2362 | // the part that the vectorized body will execute. |
2363 | Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); |
2364 | Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); |
2365 | Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, |
2366 | "end.idx.rnd.down"); |
2367 | |
2368 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2369 | // jump to the scalar loop. |
2370 | Value *Cmp = |
2371 | BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); |
2372 | |
2373 | BasicBlock *LastBypassBlock = BypassBlock; |
2374 | |
2375 | // Generate code to check that the loop's trip count (computed by adding one
2376 | // to the backedge-taken count) will not overflow.
2377 | { |
2378 | auto PastOverflowCheck = |
2379 | std::next(BasicBlock::iterator(OverflowCheckAnchor)); |
2380 | BasicBlock *CheckBlock = |
2381 | LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); |
2382 | if (ParentLoop) |
2383 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2384 | LoopBypassBlocks.push_back(CheckBlock); |
2385 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2386 | BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); |
2387 | OldTerm->eraseFromParent(); |
2388 | LastBypassBlock = CheckBlock; |
2389 | } |
2390 | |
2391 | // Generate the code to check that the strides we assumed to be one are really |
2392 | // one. We want the new basic block to start at the first instruction in a |
2393 | // sequence of instructions that form a check. |
2394 | Instruction *StrideCheck; |
2395 | Instruction *FirstCheckInst; |
2396 | std::tie(FirstCheckInst, StrideCheck) = |
2397 | addStrideCheck(LastBypassBlock->getTerminator()); |
2398 | if (StrideCheck) { |
2399 | // Create a new block containing the stride check. |
2400 | BasicBlock *CheckBlock = |
2401 | LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); |
2402 | if (ParentLoop) |
2403 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2404 | LoopBypassBlocks.push_back(CheckBlock); |
2405 | |
2406 | // Replace the branch into the stride check block with a conditional branch
2407 | // for the "few elements case".
2408 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2409 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2410 | OldTerm->eraseFromParent(); |
2411 | |
2412 | Cmp = StrideCheck; |
2413 | LastBypassBlock = CheckBlock; |
2414 | } |
2415 | |
2416 | // Generate the code that checks in runtime if arrays overlap. We put the |
2417 | // checks into a separate block to make the more common case of few elements |
2418 | // faster. |
2419 | Instruction *MemRuntimeCheck; |
2420 | std::tie(FirstCheckInst, MemRuntimeCheck) = |
2421 | addRuntimeCheck(LastBypassBlock->getTerminator()); |
2422 | if (MemRuntimeCheck) { |
2423 | // Create a new block containing the memory check. |
2424 | BasicBlock *CheckBlock = |
2425 | LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); |
2426 | if (ParentLoop) |
2427 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2428 | LoopBypassBlocks.push_back(CheckBlock); |
2429 | |
2430 | // Replace the branch into the memory check block with a conditional branch |
2431 | // for the "few elements case". |
2432 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2433 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2434 | OldTerm->eraseFromParent(); |
2435 | |
2436 | Cmp = MemRuntimeCheck; |
2437 | LastBypassBlock = CheckBlock; |
2438 | } |
2439 | |
2440 | LastBypassBlock->getTerminator()->eraseFromParent(); |
2441 | BranchInst::Create(MiddleBlock, VectorPH, Cmp, |
2442 | LastBypassBlock); |
2443 | |
2444 | // We are going to resume the execution of the scalar loop. |
2445 | // Go over all of the induction variables that we found and fix the |
2446 | // PHIs that are left in the scalar version of the loop. |
2447 | // The starting values of PHI nodes depend on the counter of the last |
2448 | // iteration in the vectorized loop. |
2449 | // If we come from a bypass edge then we need to start from the original |
2450 | // start value. |
2451 | |
2452 | // This variable saves the new starting index for the scalar loop. |
2453 | PHINode *ResumeIndex = nullptr; |
2454 | LoopVectorizationLegality::InductionList::iterator I, E; |
2455 | LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); |
2456 | // Set builder to point to last bypass block. |
2457 | BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); |
2458 | for (I = List->begin(), E = List->end(); I != E; ++I) { |
2459 | PHINode *OrigPhi = I->first; |
2460 | LoopVectorizationLegality::InductionInfo II = I->second; |
2461 | |
2462 | Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); |
2463 | PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", |
2464 | MiddleBlock->getTerminator()); |
2465 | // We might have extended the type of the induction variable but we need a |
2466 | // truncated version for the scalar loop. |
2467 | PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? |
2468 | PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", |
2469 | MiddleBlock->getTerminator()) : nullptr; |
2470 | |
2471 | // Create phi nodes to merge from the backedge-taken check block. |
2472 | PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", |
2473 | ScalarPH->getTerminator()); |
2474 | BCResumeVal->addIncoming(ResumeVal, MiddleBlock); |
2475 | |
2476 | PHINode *BCTruncResumeVal = nullptr; |
2477 | if (OrigPhi == OldInduction) { |
2478 | BCTruncResumeVal = |
2479 | PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", |
2480 | ScalarPH->getTerminator()); |
2481 | BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); |
2482 | } |
2483 | |
2484 | Value *EndValue = nullptr; |
2485 | switch (II.IK) { |
2486 | case LoopVectorizationLegality::IK_NoInduction: |
2487 | llvm_unreachable("Unknown induction");
2488 | case LoopVectorizationLegality::IK_IntInduction: { |
2489 | // Handle the integer induction counter. |
2490 | assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
2491 | |
2492 | // We have the canonical induction variable. |
2493 | if (OrigPhi == OldInduction) { |
2494 | // Create a truncated version of the resume value for the scalar loop, |
2495 | // we might have promoted the type to a larger width. |
2496 | EndValue = |
2497 | BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); |
2498 | // The new PHI merges the original incoming value, in case of a bypass, |
2499 | // or the value at the end of the vectorized loop. |
2500 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2501 | TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2502 | TruncResumeVal->addIncoming(EndValue, VecBody); |
2503 | |
2504 | BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2505 | |
2506 | // We know what the end value is. |
2507 | EndValue = IdxEndRoundDown; |
2508 | // We also know which PHI node holds it. |
2509 | ResumeIndex = ResumeVal; |
2510 | break; |
2511 | } |
2512 | |
2513 | // Not the canonical induction variable - add the vector loop count to the |
2514 | // start value. |
2515 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2516 | II.StartValue->getType(), |
2517 | "cast.crd"); |
2518 | EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); |
2519 | break; |
2520 | } |
2521 | case LoopVectorizationLegality::IK_ReverseIntInduction: { |
2522 | // Convert the CountRoundDown variable to the PHI size. |
2523 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2524 | II.StartValue->getType(), |
2525 | "cast.crd"); |
2526 | // Handle reverse integer induction counter. |
2527 | EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); |
2528 | break; |
2529 | } |
2530 | case LoopVectorizationLegality::IK_PtrInduction: { |
2531 | // For pointer induction variables, calculate the offset using |
2532 | // the end index. |
2533 | EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, |
2534 | "ptr.ind.end"); |
2535 | break; |
2536 | } |
2537 | case LoopVectorizationLegality::IK_ReversePtrInduction: { |
2538 | // The value at the end of the loop for the reverse pointer is calculated |
2539 | // by creating a GEP with a negative index starting from the start value. |
2540 | Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); |
2541 | Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, |
2542 | "rev.ind.end"); |
2543 | EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, |
2544 | "rev.ptr.ind.end"); |
2545 | break; |
2546 | } |
2547 | }// end of case |
2548 | |
2549 | // The new PHI merges the original incoming value, in case of a bypass, |
2550 | // or the value at the end of the vectorized loop. |
2551 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { |
2552 | if (OrigPhi == OldInduction) |
2553 | ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2554 | else |
2555 | ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2556 | } |
2557 | ResumeVal->addIncoming(EndValue, VecBody); |
2558 | |
2559 | // Fix the scalar body counter (PHI node). |
2560 | unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); |
2561 | |
2562 | // The old induction's phi node in the scalar body needs the truncated |
2563 | // value. |
2564 | if (OrigPhi == OldInduction) { |
2565 | BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); |
2566 | OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); |
2567 | } else { |
2568 | BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2569 | OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); |
2570 | } |
2571 | } |
2572 | |
2573 | // If we are generating a new induction variable then we also need to |
2574 | // generate the code that calculates the exit value. This value is not |
2575 | // simply the end of the counter because we may skip the vectorized body |
2576 | // in case of a runtime check. |
2577 | if (!OldInduction){ |
2578 | assert(!ResumeIndex && "Unexpected resume value found");
2579 | ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", |
2580 | MiddleBlock->getTerminator()); |
2581 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2582 | ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2583 | ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); |
2584 | } |
2585 | |
2586 | // Make sure that we found the index where the scalar loop needs to continue.
2587 | assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
2588 | "Invalid resume Index");
2589 | |
2590 | // Add a check in the middle block to see if we have completed |
2591 | // all of the iterations in the first vector loop. |
2592 | // If (N - N%VF) == N, then we *don't* need to run the remainder. |
2593 | Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, |
2594 | ResumeIndex, "cmp.n", |
2595 | MiddleBlock->getTerminator()); |
2596 | |
2597 | BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); |
2598 | // Remove the old terminator. |
2599 | MiddleBlock->getTerminator()->eraseFromParent(); |
2600 | |
2601 | // Create i+1 and fill the PHINode. |
2602 | Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); |
2603 | Induction->addIncoming(StartIdx, VectorPH); |
2604 | Induction->addIncoming(NextIdx, VecBody); |
2605 | // Create the compare. |
2606 | Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); |
2607 | Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); |
2608 | |
2609 | // Now we have two terminators. Remove the old one from the block. |
2610 | VecBody->getTerminator()->eraseFromParent(); |
2611 | |
2612 | // Get ready to start creating new instructions into the vectorized body. |
2613 | Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); |
2614 | |
2615 | // Save the state. |
2616 | LoopVectorPreHeader = VectorPH; |
2617 | LoopScalarPreHeader = ScalarPH; |
2618 | LoopMiddleBlock = MiddleBlock; |
2619 | LoopExitBlock = ExitBlock; |
2620 | LoopVectorBody.push_back(VecBody); |
2621 | LoopScalarBody = OldBasicBlock; |
2622 | |
2623 | LoopVectorizeHints Hints(Lp, true); |
2624 | Hints.setAlreadyVectorized(); |
2625 | } |
2626 | |
2627 | /// This function returns the identity element (or neutral element) for |
2628 | /// the operation K. |
2629 | Constant* |
2630 | LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { |
2631 | switch (K) { |
2632 | case RK_IntegerXor: |
2633 | case RK_IntegerAdd: |
2634 | case RK_IntegerOr: |
2635 | // Adding, xoring, or oring zero to a number does not change it.
2636 | return ConstantInt::get(Tp, 0); |
2637 | case RK_IntegerMult: |
2638 | // Multiplying a number by 1 does not change it. |
2639 | return ConstantInt::get(Tp, 1); |
2640 | case RK_IntegerAnd: |
2641 | // AND-ing a number with an all-1 value does not change it. |
2642 | return ConstantInt::get(Tp, -1, true); |
2643 | case RK_FloatMult: |
2644 | // Multiplying a number by 1 does not change it. |
2645 | return ConstantFP::get(Tp, 1.0L); |
2646 | case RK_FloatAdd: |
2647 | // Adding zero to a number does not change it. |
2648 | return ConstantFP::get(Tp, 0.0L); |
2649 | default: |
2650 | llvm_unreachable("Unknown reduction kind");
2651 | } |
2652 | } |
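
// For illustration, how the identity seeds a reduction, assuming a VF-wide
// integer add (a sketch, not code from this file):
//
//   Constant *Iden = getReductionIdentity(RK_IntegerAdd, Int32Ty); // i32 0
//   Constant *Ident = ConstantVector::getSplat(VF, Iden);          // <0,0,0,0>
//   // Lane 0 is later replaced by the incoming scalar start value, so
//   // summing all lanes reproduces the scalar running sum.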
2653 | |
2654 | /// This function translates the reduction kind to an LLVM binary operator. |
2655 | static unsigned |
2656 | getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { |
2657 | switch (Kind) { |
2658 | case LoopVectorizationLegality::RK_IntegerAdd: |
2659 | return Instruction::Add; |
2660 | case LoopVectorizationLegality::RK_IntegerMult: |
2661 | return Instruction::Mul; |
2662 | case LoopVectorizationLegality::RK_IntegerOr: |
2663 | return Instruction::Or; |
2664 | case LoopVectorizationLegality::RK_IntegerAnd: |
2665 | return Instruction::And; |
2666 | case LoopVectorizationLegality::RK_IntegerXor: |
2667 | return Instruction::Xor; |
2668 | case LoopVectorizationLegality::RK_FloatMult: |
2669 | return Instruction::FMul; |
2670 | case LoopVectorizationLegality::RK_FloatAdd: |
2671 | return Instruction::FAdd; |
2672 | case LoopVectorizationLegality::RK_IntegerMinMax: |
2673 | return Instruction::ICmp; |
2674 | case LoopVectorizationLegality::RK_FloatMinMax: |
2675 | return Instruction::FCmp; |
2676 | default: |
2677 | llvm_unreachable("Unknown reduction operation");
2678 | } |
2679 | } |
2680 | |
2681 | Value *createMinMaxOp(IRBuilder<> &Builder, |
2682 | LoopVectorizationLegality::MinMaxReductionKind RK, |
2683 | Value *Left, |
2684 | Value *Right) { |
2685 | CmpInst::Predicate P = CmpInst::ICMP_NE; |
2686 | switch (RK) { |
2687 | default: |
2688 | llvm_unreachable("Unknown min/max reduction kind");
2689 | case LoopVectorizationLegality::MRK_UIntMin: |
2690 | P = CmpInst::ICMP_ULT; |
2691 | break; |
2692 | case LoopVectorizationLegality::MRK_UIntMax: |
2693 | P = CmpInst::ICMP_UGT; |
2694 | break; |
2695 | case LoopVectorizationLegality::MRK_SIntMin: |
2696 | P = CmpInst::ICMP_SLT; |
2697 | break; |
2698 | case LoopVectorizationLegality::MRK_SIntMax: |
2699 | P = CmpInst::ICMP_SGT; |
2700 | break; |
2701 | case LoopVectorizationLegality::MRK_FloatMin: |
2702 | P = CmpInst::FCMP_OLT; |
2703 | break; |
2704 | case LoopVectorizationLegality::MRK_FloatMax: |
2705 | P = CmpInst::FCMP_OGT; |
2706 | break; |
2707 | } |
2708 | |
2709 | Value *Cmp; |
2710 | if (RK == LoopVectorizationLegality::MRK_FloatMin || |
2711 | RK == LoopVectorizationLegality::MRK_FloatMax) |
2712 | Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); |
2713 | else |
2714 | Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); |
2715 | |
2716 | Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); |
2717 | return Select; |
2718 | } |
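
// For example, a signed-max reduction step lowers to a compare plus a select;
// a scalar sketch of the same pattern:
//
//   int smax(int Left, int Right) {
//     bool Cmp = Left > Right;      // ICMP_SGT -> "rdx.minmax.cmp"
//     return Cmp ? Left : Right;    // "rdx.minmax.select"
//   }
//
// The emitted version applies this lane-wise to vector operands.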
2719 | |
2720 | namespace { |
2721 | struct CSEDenseMapInfo { |
2722 | static bool canHandle(Instruction *I) { |
2723 | return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || |
2724 | isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); |
2725 | } |
2726 | static inline Instruction *getEmptyKey() { |
2727 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
2728 | } |
2729 | static inline Instruction *getTombstoneKey() { |
2730 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
2731 | } |
2732 | static unsigned getHashValue(Instruction *I) { |
2733 | assert(canHandle(I) && "Unknown instruction!");
2734 | return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), |
2735 | I->value_op_end())); |
2736 | } |
2737 | static bool isEqual(Instruction *LHS, Instruction *RHS) { |
2738 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
2739 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
2740 | return LHS == RHS; |
2741 | return LHS->isIdenticalTo(RHS); |
2742 | } |
2743 | }; |
2744 | } |
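
// For illustration, two instructions that CSEDenseMapInfo treats as equal
// (a sketch; the %-names are hypothetical IR values):
//
//   %g1 = getelementptr i32* %base, i64 %i
//   %g2 = getelementptr i32* %base, i64 %i
//
// getHashValue combines the opcode with the operand list, and isEqual defers
// to Instruction::isIdenticalTo, so cse() below can fold %g2 into %g1.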
2745 | |
2746 | /// \brief Check whether this block is a predicated block. |
2747 | /// Due to if predication of stores we might create a sequence of "if(pred) a[i] |
2748 | /// = ...; " blocks. We start with one vectorized basic block. For every |
2749 | /// conditional block we split this vectorized block. Therefore, every second |
2750 | /// block will be a predicated one. |
2751 | static bool isPredicatedBlock(unsigned BlockNum) { |
2752 | return BlockNum % 2; |
2753 | } |
2754 | |
2755 | /// \brief Perform CSE of induction variable instructions.
2756 | static void cse(SmallVector<BasicBlock *, 4> &BBs) { |
2757 | // Perform simple CSE.
2758 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
2759 | for (unsigned i = 0, e = BBs.size(); i != e; ++i) { |
2760 | BasicBlock *BB = BBs[i]; |
2761 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { |
2762 | Instruction *In = I++; |
2763 | |
2764 | if (!CSEDenseMapInfo::canHandle(In)) |
2765 | continue; |
2766 | |
2767 | // Check if we can replace this instruction with any of the |
2768 | // visited instructions. |
2769 | if (Instruction *V = CSEMap.lookup(In)) { |
2770 | In->replaceAllUsesWith(V); |
2771 | In->eraseFromParent(); |
2772 | continue; |
2773 | } |
2774 | // Ignore instructions in conditional blocks. We create "if (pred) a[i] = |
2775 | // ...;" blocks for predicated stores. Every second block is a predicated |
2776 | // block. |
2777 | if (isPredicatedBlock(i)) |
2778 | continue; |
2779 | |
2780 | CSEMap[In] = In; |
2781 | } |
2782 | } |
2783 | } |
2784 | |
2785 | /// \brief Adds a 'fast' flag to floating point operations. |
2786 | static Value *addFastMathFlag(Value *V) { |
2787 | if (isa<FPMathOperator>(V)){ |
2788 | FastMathFlags Flags; |
2789 | Flags.setUnsafeAlgebra(); |
2790 | cast<Instruction>(V)->setFastMathFlags(Flags); |
2791 | } |
2792 | return V; |
2793 | } |
2794 | |
2795 | void InnerLoopVectorizer::vectorizeLoop() { |
2796 | //===------------------------------------------------===// |
2797 | // |
2798 | // Notice: any optimization or new instruction that goes
2799 | // into the code below should also be implemented in
2800 | // the cost-model.
2801 | // |
2802 | //===------------------------------------------------===// |
2803 | Constant *Zero = Builder.getInt32(0); |
2804 | |
2805 | // In order to support reduction variables we need to be able to vectorize |
2806 | // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two |
2807 | // stages. First, we create a new vector PHI node with no incoming edges. |
2808 | // We use this value when we vectorize all of the instructions that use the |
2809 | // PHI. Next, after all of the instructions in the block are complete we |
2810 | // add the new incoming edges to the PHI. At this point all of the |
2811 | // instructions in the basic block are vectorized, so we can use them to |
2812 | // construct the PHI. |
2813 | PhiVector RdxPHIsToFix; |
2814 | |
2815 | // Scan the loop in a topological order to ensure that defs are vectorized |
2816 | // before users. |
2817 | LoopBlocksDFS DFS(OrigLoop); |
2818 | DFS.perform(LI); |
2819 | |
2820 | // Vectorize all of the blocks in the original loop. |
2821 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
2822 | be = DFS.endRPO(); bb != be; ++bb) |
2823 | vectorizeBlockInLoop(*bb, &RdxPHIsToFix); |
2824 | |
2825 | // At this point every instruction in the original loop is widened to |
2826 | // a vector form. We are almost done. Now, we need to fix the PHI nodes |
2827 | // that we vectorized. The PHI nodes are currently empty because we did |
2828 | // not want to introduce cycles. Notice that the remaining PHI nodes |
2829 | // that we need to fix are reduction variables. |
2830 | |
2831 | // Create the 'reduced' values for each of the reduction vars.
2832 | // The reduced values are the vector values that we scalarize and combine |
2833 | // after the loop is finished. |
2834 | for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); |
2835 | it != e; ++it) { |
2836 | PHINode *RdxPhi = *it; |
2837 | assert(RdxPhi && "Unable to recover vectorized PHI");
2838 | |
2839 | // Find the reduction variable descriptor. |
2840 | assert(Legal->getReductionVars()->count(RdxPhi) &&
2841 | "Unable to find the reduction variable");
2842 | LoopVectorizationLegality::ReductionDescriptor RdxDesc = |
2843 | (*Legal->getReductionVars())[RdxPhi]; |
2844 | |
2845 | setDebugLocFromInst(Builder, RdxDesc.StartValue); |
2846 | |
2847 | // We need to generate a reduction vector from the incoming scalar. |
2848 | // To do so, we need to generate the 'identity' vector and override |
2849 | // one of the elements with the incoming scalar reduction. We need |
2850 | // to do it in the vector-loop preheader. |
2851 | Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); |
2852 | |
2853 | // This is the vector-clone of the value that leaves the loop. |
2854 | VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); |
2855 | Type *VecTy = VectorExit[0]->getType(); |
2856 | |
2857 | // Find the reduction identity variable: zero for addition, or, and xor;
2858 | // one for multiplication; -1 for and.
2859 | Value *Identity; |
2860 | Value *VectorStart; |
2861 | if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || |
2862 | RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { |
2863 | // MinMax reductions have the start value as their identity.
2864 | if (VF == 1) { |
2865 | VectorStart = Identity = RdxDesc.StartValue; |
2866 | } else { |
2867 | VectorStart = Identity = Builder.CreateVectorSplat(VF, |
2868 | RdxDesc.StartValue, |
2869 | "minmax.ident"); |
2870 | } |
2871 | } else { |
2872 | // Handle other reduction kinds: |
2873 | Constant *Iden = |
2874 | LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, |
2875 | VecTy->getScalarType()); |
2876 | if (VF == 1) { |
2877 | Identity = Iden; |
2878 | // This vector is the Identity vector where the first element is the |
2879 | // incoming scalar reduction. |
2880 | VectorStart = RdxDesc.StartValue; |
2881 | } else { |
2882 | Identity = ConstantVector::getSplat(VF, Iden); |
2883 | |
2884 | // This vector is the Identity vector where the first element is the |
2885 | // incoming scalar reduction. |
2886 | VectorStart = Builder.CreateInsertElement(Identity, |
2887 | RdxDesc.StartValue, Zero); |
2888 | } |
2889 | } |
2890 | |
2891 | // Fix the vector-loop phi. |
2892 | |
2893 | // Reductions do not have to start at zero. They can start with |
2894 | // any loop invariant values. |
2895 | VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); |
2896 | BasicBlock *Latch = OrigLoop->getLoopLatch(); |
2897 | Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); |
2898 | VectorParts &Val = getVectorValue(LoopVal); |
2899 | for (unsigned part = 0; part < UF; ++part) { |
2900 | // Make sure to add the reduction start value only to the
2901 | // first unroll part.
2902 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2903 | cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, |
2904 | LoopVectorPreHeader); |
2905 | cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], |
2906 | LoopVectorBody.back()); |
2907 | } |
2908 | |
2909 | // Before each round, move the insertion point right between |
2910 | // the PHIs and the values we are going to write. |
2911 | // This allows us to write both PHINodes and the extractelement |
2912 | // instructions. |
2913 | Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); |
2914 | |
2915 | VectorParts RdxParts; |
2916 | setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); |
2917 | for (unsigned part = 0; part < UF; ++part) { |
2918 | // This PHINode contains the vectorized reduction variable, or |
2919 | // the initial value vector, if we bypass the vector loop. |
2920 | VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); |
2921 | PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); |
2922 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2923 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2924 | NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); |
2925 | NewPhi->addIncoming(RdxExitVal[part], |
2926 | LoopVectorBody.back()); |
2927 | RdxParts.push_back(NewPhi); |
2928 | } |
2929 | |
2930 | // Reduce all of the unrolled parts into a single vector. |
2931 | Value *ReducedPartRdx = RdxParts[0]; |
2932 | unsigned Op = getReductionBinOp(RdxDesc.Kind); |
2933 | setDebugLocFromInst(Builder, ReducedPartRdx); |
2934 | for (unsigned part = 1; part < UF; ++part) { |
2935 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2936 | // Floating point operations had to be 'fast' to enable the reduction. |
2937 | ReducedPartRdx = addFastMathFlag( |
2938 | Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], |
2939 | ReducedPartRdx, "bin.rdx")); |
2940 | else |
2941 | ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, |
2942 | ReducedPartRdx, RdxParts[part]); |
2943 | } |
2944 | |
2945 | if (VF > 1) { |
2946 | // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles |
2947 | // and vector ops, reducing the set of values being computed by half each |
2948 | // round. |
2949 | assert(isPowerOf2_32(VF) &&
2950 | "Reduction emission only supported for pow2 vectors!");
2951 | Value *TmpVec = ReducedPartRdx; |
2952 | SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); |
2953 | for (unsigned i = VF; i != 1; i >>= 1) { |
2954 | // Move the upper half of the vector to the lower half. |
2955 | for (unsigned j = 0; j != i/2; ++j) |
2956 | ShuffleMask[j] = Builder.getInt32(i/2 + j); |
2957 | |
2958 | // Fill the rest of the mask with undef. |
2959 | std::fill(&ShuffleMask[i/2], ShuffleMask.end(), |
2960 | UndefValue::get(Builder.getInt32Ty())); |
2961 | |
2962 | Value *Shuf = |
2963 | Builder.CreateShuffleVector(TmpVec, |
2964 | UndefValue::get(TmpVec->getType()), |
2965 | ConstantVector::get(ShuffleMask), |
2966 | "rdx.shuf"); |
2967 | |
2968 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2969 | // Floating point operations had to be 'fast' to enable the reduction. |
2970 | TmpVec = addFastMathFlag(Builder.CreateBinOp( |
2971 | (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); |
2972 | else |
2973 | TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); |
2974 | } |
2975 | |
2976 | // The result is in the first element of the vector. |
2977 | ReducedPartRdx = Builder.CreateExtractElement(TmpVec, |
2978 | Builder.getInt32(0)); |
2979 | } |
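
// A worked example of the log2(VF) shuffle reduction, assuming VF == 4 and
// an integer add (lane values are illustrative):
//
//   TmpVec = <a, b, c, d>
//   shuffle mask <2, 3, u, u>  ->  <c, d, u, u>
//   add                        ->  <a+c, b+d, u, u>
//   shuffle mask <1, u, u, u>  ->  <b+d, u, u, u>
//   add                        ->  <a+b+c+d, u, u, u>
//
// The extractelement above then reads the final sum out of lane 0.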
2980 | |
2981 | // Create a phi node that merges control-flow from the backedge-taken check |
2982 | // block and the middle block. |
2983 | PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", |
2984 | LoopScalarPreHeader->getTerminator()); |
2985 | BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); |
2986 | BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
2987 | |
2988 | // Now, we need to fix the users of the reduction variable |
2989 | // inside and outside of the scalar remainder loop. |
2990 | // We know that the loop is in LCSSA form. We need to update the |
2991 | // PHI nodes in the exit blocks. |
2992 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
2993 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
2994 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
2995 | if (!LCSSAPhi) break; |
2996 | |
2997 | // All PHINodes need to have a single entry edge, or two if |
2998 | // we already fixed them. |
2999 | assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3000 | |
3001 | // We found our reduction value exit-PHI. Update it with the |
3002 | // incoming bypass edge. |
3003 | if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { |
3004 | // Add an edge coming from the bypass. |
3005 | LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
3006 | break; |
3007 | } |
3008 | }// end of the LCSSA phi scan. |
3009 | |
3010 | // Fix the scalar loop reduction variable with the incoming reduction sum |
3011 | // from the vector body and from the backedge value. |
3012 | int IncomingEdgeBlockIdx = |
3013 | (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); |
3014 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3015 | // Pick the other block. |
3016 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
3017 | (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); |
3018 | (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); |
3019 | }// end of for each redux variable. |
3020 | |
3021 | fixLCSSAPHIs(); |
3022 | |
3023 | // Remove redundant induction instructions. |
3024 | cse(LoopVectorBody); |
3025 | } |
3026 | |
3027 | void InnerLoopVectorizer::fixLCSSAPHIs() { |
3028 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
3029 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
3030 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
3031 | if (!LCSSAPhi) break; |
3032 | if (LCSSAPhi->getNumIncomingValues() == 1) |
3033 | LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), |
3034 | LoopMiddleBlock); |
3035 | } |
3036 | } |
3037 | |
3038 | InnerLoopVectorizer::VectorParts |
3039 | InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { |
3040 | assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
3041 | "Invalid edge");
3042 | |
3043 | // Look for cached value. |
3044 | std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); |
3045 | EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); |
3046 | if (ECEntryIt != MaskCache.end()) |
3047 | return ECEntryIt->second; |
3048 | |
3049 | VectorParts SrcMask = createBlockInMask(Src); |
3050 | |
3051 | // The terminator has to be a branch inst! |
3052 | BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); |
3053 | assert(BI && "Unexpected terminator found");
3054 | |
3055 | if (BI->isConditional()) { |
3056 | VectorParts EdgeMask = getVectorValue(BI->getCondition()); |
3057 | |
3058 | if (BI->getSuccessor(0) != Dst) |
3059 | for (unsigned part = 0; part < UF; ++part) |
3060 | EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); |
3061 | |
3062 | for (unsigned part = 0; part < UF; ++part) |
3063 | EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); |
3064 | |
3065 | MaskCache[Edge] = EdgeMask; |
3066 | return EdgeMask; |
3067 | } |
3068 | |
3069 | MaskCache[Edge] = SrcMask; |
3070 | return SrcMask; |
3071 | } |
3072 | |
3073 | InnerLoopVectorizer::VectorParts |
3074 | InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { |
3075 | assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
3076 | |
3077 | // Loop incoming mask is all-one. |
3078 | if (OrigLoop->getHeader() == BB) { |
3079 | Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); |
3080 | return getVectorValue(C); |
3081 | } |
3082 | |
3083 | // This is the block mask. We OR all incoming edge masks, starting with zero.
3084 | Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); |
3085 | VectorParts BlockMask = getVectorValue(Zero); |
3086 | |
3087 | // For each pred: |
3088 | for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { |
3089 | VectorParts EM = createEdgeMask(*it, BB); |
3090 | for (unsigned part = 0; part < UF; ++part) |
3091 | BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); |
3092 | } |
3093 | |
3094 | return BlockMask; |
3095 | } |
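
// For illustration, the masks these two functions produce for a simple
// diamond inside the loop (block names are hypothetical):
//
//   header -> { then, else } -> merge, branching on 'cond'
//
//   mask(then)  = mask(header) & cond
//   mask(else)  = mask(header) & ~cond
//   mask(merge) = mask(then) | mask(else)
//
// The loop header itself gets the all-ones mask returned above.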
3096 | |
3097 | void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, |
3098 | InnerLoopVectorizer::VectorParts &Entry, |
3099 | unsigned UF, unsigned VF, PhiVector *PV) { |
3100 | PHINode* P = cast<PHINode>(PN); |
3101 | // Handle reduction variables: |
3102 | if (Legal->getReductionVars()->count(P)) { |
3103 | for (unsigned part = 0; part < UF; ++part) { |
3104 | // This is phase one of vectorizing PHIs. |
3105 | Type *VecTy = (VF == 1) ? PN->getType() : |
3106 | VectorType::get(PN->getType(), VF); |
3107 | Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", |
3108 | LoopVectorBody.back()-> getFirstInsertionPt()); |
3109 | } |
3110 | PV->push_back(P); |
3111 | return; |
3112 | } |
3113 | |
3114 | setDebugLocFromInst(Builder, P); |
3115 | // Check for PHI nodes that are lowered to vector selects. |
3116 | if (P->getParent() != OrigLoop->getHeader()) { |
3117 | // We know that all PHIs in non-header blocks are converted into |
3118 | // selects, so we don't have to worry about the insertion order and we |
3119 | // can just use the builder. |
3120 | // At this point we generate the predication tree. There may be |
3121 | // duplications since this is a simple recursive scan, but future |
3122 | // optimizations will clean it up. |
3123 | |
3124 | unsigned NumIncoming = P->getNumIncomingValues(); |
3125 | |
3126 | // Generate a sequence of selects of the form: |
3127 | // SELECT(Mask3, In3, |
3128 | // SELECT(Mask2, In2, |
3129 | // ( ...))) |
3130 | for (unsigned In = 0; In < NumIncoming; In++) { |
3131 | VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), |
3132 | P->getParent()); |
3133 | VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); |
3134 | |
3135 | for (unsigned part = 0; part < UF; ++part) { |
3136 | // We might have single edge PHIs (blocks) - use an identity |
3137 | // 'select' for the first PHI operand. |
3138 | if (In == 0) |
3139 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3140 | In0[part]); |
3141 | else |
3142 | // Select between the current value and the previous incoming edge |
3143 | // based on the incoming mask. |
3144 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3145 | Entry[part], "predphi"); |
3146 | } |
3147 | } |
3148 | return; |
3149 | } |
3150 | |
3151 | // This PHINode must be an induction variable. |
3152 | // Make sure that we know about it. |
3153 | assert(Legal->getInductionVars()->count(P) &&
3154 | "Not an induction variable");
3155 | |
3156 | LoopVectorizationLegality::InductionInfo II = |
3157 | Legal->getInductionVars()->lookup(P); |
3158 | |
3159 | switch (II.IK) { |
3160 | case LoopVectorizationLegality::IK_NoInduction: |
3161 | llvm_unreachable("Unknown induction");
3162 | case LoopVectorizationLegality::IK_IntInduction: { |
3163 | assert(P->getType() == II.StartValue->getType() && "Types must match");
3164 | Type *PhiTy = P->getType(); |
3165 | Value *Broadcasted; |
3166 | if (P == OldInduction) { |
3167 | // Handle the canonical induction variable. We might have had to |
3168 | // extend the type. |
3169 | Broadcasted = Builder.CreateTrunc(Induction, PhiTy); |
3170 | } else { |
3171 | // Handle other induction variables that are now based on the |
3172 | // canonical one. |
3173 | Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, |
3174 | "normalized.idx"); |
3175 | NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); |
3176 | Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, |
3177 | "offset.idx"); |
3178 | } |
3179 | Broadcasted = getBroadcastInstrs(Broadcasted); |
3180 | // After broadcasting the induction variable we need to make the vector |
3181 | // consecutive by adding 0, 1, 2, etc. |
3182 | for (unsigned part = 0; part < UF; ++part) |
3183 | Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); |
3184 | return; |
3185 | } |
3186 | case LoopVectorizationLegality::IK_ReverseIntInduction: |
3187 | case LoopVectorizationLegality::IK_PtrInduction: |
3188 | case LoopVectorizationLegality::IK_ReversePtrInduction: |
3189 | // Handle reverse integer and pointer inductions. |
3190 | Value *StartIdx = ExtendedIdx; |
3191 | // This is the normalized GEP that starts counting at zero. |
3192 | Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, |
3193 | "normalized.idx"); |
3194 | |
3195 | // Handle the reverse integer induction variable case. |
3196 | if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { |
3197 | IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); |
3198 | Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, |
3199 | "resize.norm.idx"); |
3200 | Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, |
3201 | "reverse.idx"); |
3202 | |
3203 | // This is a new value so do not hoist it out. |
3204 | Value *Broadcasted = getBroadcastInstrs(ReverseInd); |
3205 | // After broadcasting the induction variable we need to make the |
3206 | // vector consecutive by adding ... -3, -2, -1, 0. |
3207 | for (unsigned part = 0; part < UF; ++part) |
3208 | Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, |
3209 | true); |
3210 | return; |
3211 | } |
3212 | |
3213 | // Handle the pointer induction variable case. |
3214 | assert(P->getType()->isPointerTy() && "Unexpected type.");
3215 | |
3216 | // Is this a reverse induction ptr or a consecutive induction ptr?
3217 | bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == |
3218 | II.IK); |
3219 | |
3220 | // This is the vector of results. Notice that we don't generate |
3221 | // vector geps because scalar geps result in better code. |
3222 | for (unsigned part = 0; part < UF; ++part) { |
3223 | if (VF == 1) { |
3224 | int EltIndex = (part) * (Reverse ? -1 : 1); |
3225 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3226 | Value *GlobalIdx; |
3227 | if (Reverse) |
3228 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3229 | else |
3230 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3231 | |
3232 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3233 | "next.gep"); |
3234 | Entry[part] = SclrGep; |
3235 | continue; |
3236 | } |
3237 | |
3238 | Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); |
3239 | for (unsigned int i = 0; i < VF; ++i) { |
3240 | int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); |
3241 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3242 | Value *GlobalIdx; |
3243 | if (!Reverse) |
3244 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3245 | else |
3246 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3247 | |
3248 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3249 | "next.gep"); |
3250 | VecVal = Builder.CreateInsertElement(VecVal, SclrGep, |
3251 | Builder.getInt32(i), |
3252 | "insert.gep"); |
3253 | } |
3254 | Entry[part] = VecVal; |
3255 | } |
3256 | return; |
3257 | } |
3258 | } |
3259 | |
3260 | void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { |
3261 | // For each instruction in the old loop. |
3262 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
3263 | VectorParts &Entry = WidenMap.get(it); |
3264 | switch (it->getOpcode()) { |
3265 | case Instruction::Br: |
3266 | // Nothing to do for PHIs and BR, since we already took care of the |
3267 | // loop control flow instructions. |
3268 | continue; |
3269 | case Instruction::PHI:{ |
3270 | // Vectorize PHINodes. |
3271 | widenPHIInstruction(it, Entry, UF, VF, PV); |
3272 | continue; |
3273 | }// End of PHI. |
3274 | |
3275 | case Instruction::Add: |
3276 | case Instruction::FAdd: |
3277 | case Instruction::Sub: |
3278 | case Instruction::FSub: |
3279 | case Instruction::Mul: |
3280 | case Instruction::FMul: |
3281 | case Instruction::UDiv: |
3282 | case Instruction::SDiv: |
3283 | case Instruction::FDiv: |
3284 | case Instruction::URem: |
3285 | case Instruction::SRem: |
3286 | case Instruction::FRem: |
3287 | case Instruction::Shl: |
3288 | case Instruction::LShr: |
3289 | case Instruction::AShr: |
3290 | case Instruction::And: |
3291 | case Instruction::Or: |
3292 | case Instruction::Xor: { |
3293 | // Just widen binops. |
3294 | BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); |
3295 | setDebugLocFromInst(Builder, BinOp); |
3296 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3297 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3298 | |
3299 | // Use this vector value for all users of the original instruction. |
3300 | for (unsigned Part = 0; Part < UF; ++Part) { |
3301 | Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); |
3302 | |
3303 | if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) |
3304 | VecOp->copyIRFlags(BinOp); |
3305 | |
3306 | Entry[Part] = V; |
3307 | } |
3308 | |
3309 | propagateMetadata(Entry, it); |
3310 | break; |
3311 | } |
3312 | case Instruction::Select: { |
3313 | // Widen selects. |
3314 | // If the selector is loop invariant we can create a select |
3315 | // instruction with a scalar condition. Otherwise, use vector-select. |
3316 | bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), |
3317 | OrigLoop); |
3318 | setDebugLocFromInst(Builder, it); |
3319 | |
3320 | // The condition can be loop invariant but still defined inside the |
3321 | // loop. This means that we can't just use the original 'cond' value. |
3322 | // We have to take the 'vectorized' value and pick the first lane. |
3323 | // Instcombine will make this a no-op. |
3324 | VectorParts &Cond = getVectorValue(it->getOperand(0)); |
3325 | VectorParts &Op0 = getVectorValue(it->getOperand(1)); |
3326 | VectorParts &Op1 = getVectorValue(it->getOperand(2)); |
3327 | |
3328 | Value *ScalarCond = (VF == 1) ? Cond[0] : |
3329 | Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); |
3330 | |
3331 | for (unsigned Part = 0; Part < UF; ++Part) { |
3332 | Entry[Part] = Builder.CreateSelect( |
3333 | InvariantCond ? ScalarCond : Cond[Part], |
3334 | Op0[Part], |
3335 | Op1[Part]); |
3336 | } |
3337 | |
3338 | propagateMetadata(Entry, it); |
3339 | break; |
3340 | } |
3341 | |
3342 | case Instruction::ICmp: |
3343 | case Instruction::FCmp: { |
3344 | // Widen compares. Generate vector compares. |
3345 | bool FCmp = (it->getOpcode() == Instruction::FCmp); |
3346 | CmpInst *Cmp = dyn_cast<CmpInst>(it); |
3347 | setDebugLocFromInst(Builder, it); |
3348 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3349 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3350 | for (unsigned Part = 0; Part < UF; ++Part) { |
3351 | Value *C = nullptr; |
3352 | if (FCmp) |
3353 | C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); |
3354 | else |
3355 | C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); |
3356 | Entry[Part] = C; |
3357 | } |
3358 | |
3359 | propagateMetadata(Entry, it); |
3360 | break; |
3361 | } |
3362 | |
3363 | case Instruction::Store: |
3364 | case Instruction::Load: |
3365 | vectorizeMemoryInstruction(it); |
3366 | break; |
3367 | case Instruction::ZExt: |
3368 | case Instruction::SExt: |
3369 | case Instruction::FPToUI: |
3370 | case Instruction::FPToSI: |
3371 | case Instruction::FPExt: |
3372 | case Instruction::PtrToInt: |
3373 | case Instruction::IntToPtr: |
3374 | case Instruction::SIToFP: |
3375 | case Instruction::UIToFP: |
3376 | case Instruction::Trunc: |
3377 | case Instruction::FPTrunc: |
3378 | case Instruction::BitCast: { |
3379 | CastInst *CI = dyn_cast<CastInst>(it); |
3380 | setDebugLocFromInst(Builder, it); |
3381 | /// Optimize the special case where the source is the induction |
3382 | /// variable. Notice that we can only optimize the 'trunc' case |
3383 | /// because: a. FP conversions lose precision, b. sext/zext may wrap, |
3384 | /// c. other casts depend on pointer size. |
3385 | if (CI->getOperand(0) == OldInduction && |
3386 | it->getOpcode() == Instruction::Trunc) { |
3387 | Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, |
3388 | CI->getType()); |
3389 | Value *Broadcasted = getBroadcastInstrs(ScalarCast); |
3390 | for (unsigned Part = 0; Part < UF; ++Part) |
3391 | Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); |
3392 | propagateMetadata(Entry, it); |
3393 | break; |
3394 | } |
3395 | /// Vectorize casts. |
3396 | Type *DestTy = (VF == 1) ? CI->getType() : |
3397 | VectorType::get(CI->getType(), VF); |
3398 | |
3399 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3400 | for (unsigned Part = 0; Part < UF; ++Part) |
3401 | Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); |
3402 | propagateMetadata(Entry, it); |
3403 | break; |
3404 | } |
3405 | |
3406 | case Instruction::Call: { |
3407 | // Ignore dbg intrinsics. |
3408 | if (isa<DbgInfoIntrinsic>(it)) |
3409 | break; |
3410 | setDebugLocFromInst(Builder, it); |
3411 | |
3412 | Module *M = BB->getParent()->getParent(); |
3413 | CallInst *CI = cast<CallInst>(it); |
3414 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
3415 | assert(ID && "Not an intrinsic call!");
3416 | switch (ID) { |
3417 | case Intrinsic::assume: |
3418 | case Intrinsic::lifetime_end: |
3419 | case Intrinsic::lifetime_start: |
3420 | scalarizeInstruction(it); |
3421 | break; |
3422 | default: |
3423 | bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); |
3424 | for (unsigned Part = 0; Part < UF; ++Part) { |
3425 | SmallVector<Value *, 4> Args; |
3426 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { |
3427 | if (HasScalarOpd && i == 1) { |
3428 | Args.push_back(CI->getArgOperand(i)); |
3429 | continue; |
3430 | } |
3431 | VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); |
3432 | Args.push_back(Arg[Part]); |
3433 | } |
3434 | Type *Tys[] = {CI->getType()}; |
3435 | if (VF > 1) |
3436 | Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); |
3437 | |
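 | // E.g. an llvm.sqrt call on float at VF == 4 resolves to the
 | // <4 x float> overload of the intrinsic, llvm.sqrt.v4f32.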
3438 | Function *F = Intrinsic::getDeclaration(M, ID, Tys); |
3439 | Entry[Part] = Builder.CreateCall(F, Args); |
3440 | } |
3441 | |
3442 | propagateMetadata(Entry, it); |
3443 | break; |
3444 | } |
3445 | break; |
3446 | } |
3447 | |
3448 | default: |
3449 | // All other instructions are unsupported. Scalarize them. |
3450 | scalarizeInstruction(it); |
3451 | break; |
3452 | }// end of switch. |
3453 | }// end of for_each instr. |
3454 | } |
3455 | |
3456 | void InnerLoopVectorizer::updateAnalysis() { |
3457 | // Forget the original basic block. |
3458 | SE->forgetLoop(OrigLoop); |
3459 | |
3460 | // Update the dominator tree information. |
3461 | assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
3462 | "Entry does not dominate exit.");
3463 | |
3464 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
3465 | DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); |
3466 | DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); |
3467 | |
3468 | // Due to if predication of stores we might create a sequence of "if(pred) |
3469 | // a[i] = ...; " blocks. |
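 | // In that layout LoopVectorBody is [body, predicated block, merge block,
 | // ...]: a predicated block is dominated by its predecessor, while the
 | // block after it is dominated by the block two entries back.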
3470 | for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { |
3471 | if (i == 0) |
3472 | DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); |
3473 | else if (isPredicatedBlock(i)) { |
3474 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); |
3475 | } else { |
3476 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); |
3477 | } |
3478 | } |
3479 | |
3480 | DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); |
3481 | DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); |
3482 | DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); |
3483 | DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); |
3484 | |
3485 | DEBUG(DT->verifyDomTree());
3486 | } |
3487 | |
3488 | /// \brief Check whether it is safe to if-convert this phi node. |
3489 | /// |
3490 | /// Phi nodes with constant expressions that can trap are not safe to if |
3491 | /// convert. |
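 | /// For example, a phi incoming value that is a 'sdiv' ConstantExpr whose
 | /// divisor is not known to be non-zero could trap if if-conversion turned
 | /// the phi into an unconditionally evaluated select.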
3492 | static bool canIfConvertPHINodes(BasicBlock *BB) { |
3493 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3494 | PHINode *Phi = dyn_cast<PHINode>(I); |
3495 | if (!Phi) |
3496 | return true; |
3497 | for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) |
3498 | if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) |
3499 | if (C->canTrap()) |
3500 | return false; |
3501 | } |
3502 | return true; |
3503 | } |
3504 | |
3505 | bool LoopVectorizationLegality::canVectorizeWithIfConvert() { |
3506 | if (!EnableIfConversion) { |
3507 | emitAnalysis(Report() << "if-conversion is disabled"); |
3508 | return false; |
3509 | } |
3510 | |
3511 | assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
3512 | |
3513 | // A list of pointers that we can safely read and write to. |
3514 | SmallPtrSet<Value *, 8> SafePointes; |
3515 | |
3516 | // Collect safe addresses. |
3517 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3518 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3519 | BasicBlock *BB = *BI; |
3520 | |
3521 | if (blockNeedsPredication(BB)) |
3522 | continue; |
3523 | |
3524 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3525 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) |
3526 | SafePointes.insert(LI->getPointerOperand()); |
3527 | else if (StoreInst *SI = dyn_cast<StoreInst>(I)) |
3528 | SafePointes.insert(SI->getPointerOperand()); |
3529 | } |
3530 | } |
3531 | |
3532 | // Collect the blocks that need predication. |
3533 | BasicBlock *Header = TheLoop->getHeader(); |
3534 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3535 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3536 | BasicBlock *BB = *BI; |
3537 | |
3538 | // We don't support switch statements inside loops. |
3539 | if (!isa<BranchInst>(BB->getTerminator())) { |
3540 | emitAnalysis(Report(BB->getTerminator()) |
3541 | << "loop contains a switch statement"); |
3542 | return false; |
3543 | } |
3544 | |
3545 | // We must be able to predicate all blocks that need to be predicated. |
3546 | if (blockNeedsPredication(BB)) { |
3547 | if (!blockCanBePredicated(BB, SafePointes)) { |
3548 | emitAnalysis(Report(BB->getTerminator()) |
3549 | << "control flow cannot be substituted for a select"); |
3550 | return false; |
3551 | } |
3552 | } else if (BB != Header && !canIfConvertPHINodes(BB)) { |
3553 | emitAnalysis(Report(BB->getTerminator()) |
3554 | << "control flow cannot be substituted for a select"); |
3555 | return false; |
3556 | } |
3557 | } |
3558 | |
3559 | // We can if-convert this loop. |
3560 | return true; |
3561 | } |
3562 | |
3563 | bool LoopVectorizationLegality::canVectorize() { |
3564 | // We must have a loop in canonical form. Loops with indirectbr in them cannot |
3565 | // be canonicalized. |
3566 | if (!TheLoop->getLoopPreheader()) { |
3567 | emitAnalysis( |
3568 | Report() << "loop control flow is not understood by vectorizer"); |
3569 | return false; |
3570 | } |
3571 | |
3572 | // We can only vectorize innermost loops. |
3573 | if (TheLoop->getSubLoopsVector().size()) { |
3574 | emitAnalysis(Report() << "loop is not the innermost loop"); |
3575 | return false; |
3576 | } |
3577 | |
3578 | // We must have a single backedge. |
3579 | if (TheLoop->getNumBackEdges() != 1) { |
3580 | emitAnalysis( |
3581 | Report() << "loop control flow is not understood by vectorizer"); |
3582 | return false; |
3583 | } |
3584 | |
3585 | // We must have a single exiting block. |
3586 | if (!TheLoop->getExitingBlock()) { |
3587 | emitAnalysis( |
3588 | Report() << "loop control flow is not understood by vectorizer"); |
3589 | return false; |
3590 | } |
3591 | |
3592 | // We only handle bottom-tested loops, i.e. loops in which the condition is
3593 | // checked at the end of each iteration. With that we can assume that all |
3594 | // instructions in the loop are executed the same number of times. |
3595 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
3596 | emitAnalysis( |
3597 | Report() << "loop control flow is not understood by vectorizer"); |
3598 | return false; |
3599 | } |
3600 | |
3601 | // We need to have a loop header. |
3602 | DEBUG(dbgs() << "LV: Found a loop: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0) |
3603 | TheLoop->getHeader()->getName() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0); |
3604 | |
3605 | // Check if we can if-convert non-single-bb loops. |
3606 | unsigned NumBlocks = TheLoop->getNumBlocks(); |
3607 | if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { |
3608 | DEBUG(dbgs() << "LV: Can't if-convert the loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't if-convert the loop.\n" ; } } while (0); |
3609 | return false; |
3610 | } |
3611 | |
3612 | // ScalarEvolution needs to be able to find the exit count. |
3613 | const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); |
3614 | if (ExitCount == SE->getCouldNotCompute()) { |
3615 | emitAnalysis(Report() << "could not determine number of loop iterations"); |
3616 | DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: SCEV could not compute the loop exit count.\n" ; } } while (0); |
3617 | return false; |
3618 | } |
3619 | |
3620 | // Check if we can vectorize the instructions and CFG in this loop. |
3621 | if (!canVectorizeInstrs()) { |
3622 | DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize the instructions or CFG\n" ; } } while (0); |
3623 | return false; |
3624 | } |
3625 | |
3626 | // Go over each instruction and look at memory deps. |
3627 | if (!canVectorizeMemory()) { |
3628 | DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize due to memory conflicts\n" ; } } while (0); |
3629 | return false; |
3630 | } |
3631 | |
3632 | // Collect all of the variables that remain uniform after vectorization. |
3633 | collectLoopUniforms(); |
3634 | |
3635 | DEBUG(dbgs() << "LV: We can vectorize this loop" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3636 | (PtrRtCheck.Need ? " (with a runtime bound check)" : "")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3637 | <<"!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0); |
3638 | |
3639 | // Okay! We can vectorize. At this point we don't have any other mem analysis |
3640 | // which may limit our maximum vectorization factor, so just return true with |
3641 | // no restrictions. |
3642 | return true; |
3643 | } |
3644 | |
3645 | static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { |
3646 | if (Ty->isPointerTy()) |
3647 | return DL.getIntPtrType(Ty); |
3648 | |
3649 | // It is possible that chars or shorts overflow when we ask for the loop's
3650 | // trip count; work around this by changing the type size.
3651 | if (Ty->getScalarSizeInBits() < 32) |
3652 | return Type::getInt32Ty(Ty->getContext()); |
3653 | |
3654 | return Ty; |
3655 | } |
3656 | |
3657 | static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { |
3658 | Ty0 = convertPointerToIntegerType(DL, Ty0); |
3659 | Ty1 = convertPointerToIntegerType(DL, Ty1); |
3660 | if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) |
3661 | return Ty0; |
3662 | return Ty1; |
3663 | } |
3664 | |
3665 | /// \brief Check that the instruction has outside loop users and is not an |
3666 | /// identified reduction variable. |
3667 | static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, |
3668 | SmallPtrSetImpl<Value *> &Reductions) { |
3669 | // Reduction instructions are allowed to have exit users. All other |
3670 | // instructions must not have external users. |
3671 | if (!Reductions.count(Inst)) |
3672 | // Check that all of the users of the instruction are inside the loop.
3673 | for (User *U : Inst->users()) { |
3674 | Instruction *UI = cast<Instruction>(U); |
3675 | // This user may be a reduction exit value. |
3676 | if (!TheLoop->contains(UI)) { |
3677 | DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an outside user for : " << *UI << '\n'; } } while (0); |
3678 | return true; |
3679 | } |
3680 | } |
3681 | return false; |
3682 | } |
3683 | |
3684 | bool LoopVectorizationLegality::canVectorizeInstrs() { |
3685 | BasicBlock *PreHeader = TheLoop->getLoopPreheader(); |
3686 | BasicBlock *Header = TheLoop->getHeader(); |
3687 | |
3688 | // Look for the attribute signaling the absence of NaNs. |
3689 | Function &F = *Header->getParent(); |
3690 | if (F.hasFnAttribute("no-nans-fp-math")) |
3691 | HasFunNoNaNAttr = F.getAttributes().getAttribute( |
3692 | AttributeSet::FunctionIndex, |
3693 | "no-nans-fp-math").getValueAsString() == "true"; |
3694 | |
3695 | // For each block in the loop. |
3696 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
3697 | be = TheLoop->block_end(); bb != be; ++bb) { |
3698 | |
3699 | // Scan the instructions in the block and look for hazards. |
3700 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
3701 | ++it) { |
3702 | |
3703 | if (PHINode *Phi = dyn_cast<PHINode>(it)) { |
3704 | Type *PhiTy = Phi->getType(); |
3705 | // Check that this PHI type is allowed. |
3706 | if (!PhiTy->isIntegerTy() && |
3707 | !PhiTy->isFloatingPointTy() && |
3708 | !PhiTy->isPointerTy()) { |
3709 | emitAnalysis(Report(it) |
3710 | << "loop control flow is not understood by vectorizer"); |
3711 | DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an non-int non-pointer PHI.\n" ; } } while (0); |
3712 | return false; |
3713 | } |
3714 | |
3715 | // If this PHINode is not in the header block, then we know that we |
3716 | // can convert it to select during if-conversion. No need to check if |
3717 | // the PHIs in this block are induction or reduction variables. |
3718 | if (*bb != Header) { |
3719 | // Check that this instruction has no outside users or is an |
3720 | // identified reduction value with an outside user. |
3721 | if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) |
3722 | continue; |
3723 | emitAnalysis(Report(it) << "value could not be identified as " |
3724 | "an induction or reduction variable"); |
3725 | return false; |
3726 | } |
3727 | |
3728 | // We only allow if-converted PHIs with more than two incoming values. |
3729 | if (Phi->getNumIncomingValues() != 2) { |
3730 | emitAnalysis(Report(it) |
3731 | << "control flow not understood by vectorizer"); |
3732 | DEBUG(dbgs() << "LV: Found an invalid PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an invalid PHI.\n" ; } } while (0); |
3733 | return false; |
3734 | } |
3735 | |
3736 | // This is the value coming from the preheader. |
3737 | Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); |
3738 | // Check if this is an induction variable. |
3739 | InductionKind IK = isInductionVariable(Phi); |
3740 | |
3741 | if (IK_NoInduction != IK) { |
3742 | // Get the widest type. |
3743 | if (!WidestIndTy) |
3744 | WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); |
3745 | else |
3746 | WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); |
3747 | |
3748 | // Int inductions are special because we only allow one IV. |
3749 | if (IK == IK_IntInduction) { |
3750 | // Use the phi node with the widest type as induction. Use the last |
3751 | // one if there are multiple (no good reason for doing this other |
3752 | // than it is expedient). |
3753 | if (!Induction || PhiTy == WidestIndTy) |
3754 | Induction = Phi; |
3755 | } |
3756 | |
3757 | DEBUG(dbgs() << "LV: Found an induction variable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an induction variable.\n" ; } } while (0); |
3758 | Inductions[Phi] = InductionInfo(StartValue, IK); |
3759 | |
3760 | // Until we explicitly handle the case of an induction variable with |
3761 | // an outside loop user we have to give up vectorizing this loop. |
3762 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3763 | emitAnalysis(Report(it) << "use of induction value outside of the " |
3764 | "loop is not handled by vectorizer"); |
3765 | return false; |
3766 | } |
3767 | |
3768 | continue; |
3769 | } |
3770 | |
3771 | if (AddReductionVar(Phi, RK_IntegerAdd)) {
3772 | DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
3773 | continue;
3774 | }
3775 | if (AddReductionVar(Phi, RK_IntegerMult)) {
3776 | DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
3777 | continue;
3778 | }
3779 | if (AddReductionVar(Phi, RK_IntegerOr)) {
3780 | DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
3781 | continue;
3782 | }
3783 | if (AddReductionVar(Phi, RK_IntegerAnd)) {
3784 | DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
3785 | continue;
3786 | }
3787 | if (AddReductionVar(Phi, RK_IntegerXor)) {
3788 | DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
3789 | continue;
3790 | }
3791 | if (AddReductionVar(Phi, RK_IntegerMinMax)) {
3792 | DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
3793 | continue;
3794 | }
3795 | if (AddReductionVar(Phi, RK_FloatMult)) {
3796 | DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
3797 | continue;
3798 | }
3799 | if (AddReductionVar(Phi, RK_FloatAdd)) {
3800 | DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
3801 | continue;
3802 | }
3803 | if (AddReductionVar(Phi, RK_FloatMinMax)) {
3804 | DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<
3805 | "\n");
3806 | continue;
3807 | }
3808 | |
3809 | emitAnalysis(Report(it) << "value that could not be identified as " |
3810 | "reduction is used outside the loop"); |
3811 | DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an unidentified PHI." << *Phi <<"\n"; } } while (0); |
3812 | return false; |
3813 | }// end of PHI handling |
3814 | |
3815 | // We still don't handle functions. However, we can ignore dbg intrinsic |
3816 | // calls and we do handle certain intrinsic and libm functions. |
3817 | CallInst *CI = dyn_cast<CallInst>(it); |
3818 | if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { |
3819 | emitAnalysis(Report(it) << "call instruction cannot be vectorized"); |
3820 | DEBUG(dbgs() << "LV: Found a call site.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a call site.\n" ; } } while (0); |
3821 | return false; |
3822 | } |
3823 | |
3824 | // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
3825 | // second argument is the same (i.e. loop invariant).
3826 | if (CI && |
3827 | hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { |
3828 | if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { |
3829 | emitAnalysis(Report(it) |
3830 | << "intrinsic instruction cannot be vectorized"); |
3831 | DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"; } } while (0); |
3832 | return false; |
3833 | } |
3834 | } |
3835 | |
3836 | // Check that the instruction return type is vectorizable. |
3837 | // Also, we can't vectorize extractelement instructions. |
3838 | if ((!VectorType::isValidElementType(it->getType()) && |
3839 | !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { |
3840 | emitAnalysis(Report(it) |
3841 | << "instruction return type cannot be vectorized"); |
3842 | DEBUG(dbgs() << "LV: Found unvectorizable type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable type.\n" ; } } while (0); |
3843 | return false; |
3844 | } |
3845 | |
3846 | // Check that the stored type is vectorizable. |
3847 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) { |
3848 | Type *T = ST->getValueOperand()->getType(); |
3849 | if (!VectorType::isValidElementType(T)) { |
3850 | emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); |
3851 | return false; |
3852 | } |
3853 | if (EnableMemAccessVersioning) |
3854 | collectStridedAcccess(ST); |
3855 | } |
3856 | |
3857 | if (EnableMemAccessVersioning) |
3858 | if (LoadInst *LI = dyn_cast<LoadInst>(it)) |
3859 | collectStridedAcccess(LI); |
3860 | |
3861 | // Reduction instructions are allowed to have exit users. |
3862 | // All other instructions must not have external users. |
3863 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3864 | emitAnalysis(Report(it) << "value cannot be used outside the loop"); |
3865 | return false; |
3866 | } |
3867 | |
3868 | } // next instr. |
3869 | |
3870 | } |
3871 | |
3872 | if (!Induction) { |
3873 | DEBUG(dbgs() << "LV: Did not find one integer induction var.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Did not find one integer induction var.\n" ; } } while (0); |
3874 | if (Inductions.empty()) { |
3875 | emitAnalysis(Report() |
3876 | << "loop induction variable could not be identified"); |
3877 | return false; |
3878 | } |
3879 | } |
3880 | |
3881 | return true; |
3882 | } |
3883 | |
3884 | ///\brief Remove GEPs whose indices, except the last one, are loop invariant,
3885 | /// and return the induction operand of the gep pointer.
3886 | static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, |
3887 | const DataLayout *DL, Loop *Lp) { |
3888 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); |
3889 | if (!GEP) |
3890 | return Ptr; |
3891 | |
3892 | unsigned InductionOperand = getGEPInductionOperand(DL, GEP); |
3893 | |
3894 | // Check that all of the gep indices are uniform except for our induction |
3895 | // operand. |
3896 | for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) |
3897 | if (i != InductionOperand && |
3898 | !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) |
3899 | return Ptr; |
3900 | return GEP->getOperand(InductionOperand); |
3901 | } |
3902 | |
3903 | ///\brief Look for a cast use of the passed value. |
3904 | static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { |
3905 | Value *UniqueCast = nullptr; |
3906 | for (User *U : Ptr->users()) { |
3907 | CastInst *CI = dyn_cast<CastInst>(U); |
3908 | if (CI && CI->getType() == Ty) { |
3909 | if (!UniqueCast) |
3910 | UniqueCast = CI; |
3911 | else |
3912 | return nullptr; |
3913 | } |
3914 | } |
3915 | return UniqueCast; |
3916 | } |
3917 | |
3918 | ///\brief Get the stride of a pointer access in a loop. |
3919 | /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a |
3920 | /// pointer to the Value, or null otherwise. |
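 | /// For example, for the C loop 'for (i = 0; i < n; ++i) a[i * stride] = x;'
 | /// the returned value is the loop-invariant 'stride' operand feeding the
 | /// address computation.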
3921 | static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, |
3922 | const DataLayout *DL, Loop *Lp) { |
3923 | const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); |
3924 | if (!PtrTy || PtrTy->isAggregateType()) |
3925 | return nullptr; |
3926 | |
3927 | // Try to remove a gep instruction to make the pointer (actually, the index
3928 | // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
3929 | // analyzing the pointer, otherwise we are analyzing the index.
3930 | Value *OrigPtr = Ptr; |
3931 | |
3932 | // The size of the pointer access. |
3933 | int64_t PtrAccessSize = 1; |
3934 | |
3935 | Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); |
3936 | const SCEV *V = SE->getSCEV(Ptr); |
3937 | |
3938 | if (Ptr != OrigPtr) |
3939 | // Strip off casts. |
3940 | while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) |
3941 | V = C->getOperand(); |
3942 | |
3943 | const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); |
3944 | if (!S) |
3945 | return nullptr; |
3946 | |
3947 | V = S->getStepRecurrence(*SE); |
3948 | if (!V) |
3949 | return nullptr; |
3950 | |
3951 | // Strip off the size of access multiplication if we are still analyzing the |
3952 | // pointer. |
3953 | if (OrigPtr == Ptr) { |
3954 | PtrAccessSize = DL->getTypeAllocSize(PtrTy->getElementType());
3955 | if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { |
3956 | if (M->getOperand(0)->getSCEVType() != scConstant) |
3957 | return nullptr; |
3958 | |
3959 | const APInt &APStepVal = |
3960 | cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); |
3961 | |
3962 | // Huge step value - give up. |
3963 | if (APStepVal.getBitWidth() > 64) |
3964 | return nullptr; |
3965 | |
3966 | int64_t StepVal = APStepVal.getSExtValue(); |
3967 | if (PtrAccessSize != StepVal) |
3968 | return nullptr; |
3969 | V = M->getOperand(1); |
3970 | } |
3971 | } |
3972 | |
3973 | // Strip off casts. |
3974 | Type *StripedOffRecurrenceCast = nullptr; |
3975 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { |
3976 | StripedOffRecurrenceCast = C->getType(); |
3977 | V = C->getOperand(); |
3978 | } |
3979 | |
3980 | // Look for the loop invariant symbolic value. |
3981 | const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); |
3982 | if (!U) |
3983 | return nullptr; |
3984 | |
3985 | Value *Stride = U->getValue(); |
3986 | if (!Lp->isLoopInvariant(Stride)) |
3987 | return nullptr; |
3988 | |
3989 | // If we have stripped off the recurrence cast we have to make sure that we |
3990 | // return the value that is used in this loop so that we can replace it later. |
3991 | if (StripedOffRecurrenceCast) |
3992 | Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); |
3993 | |
3994 | return Stride; |
3995 | } |
3996 | |
3997 | void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { |
3998 | Value *Ptr = nullptr; |
3999 | if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) |
4000 | Ptr = LI->getPointerOperand(); |
4001 | else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) |
4002 | Ptr = SI->getPointerOperand(); |
4003 | else |
4004 | return; |
4005 | |
4006 | Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); |
4007 | if (!Stride) |
4008 | return; |
4009 | |
4010 | DEBUG(dbgs() << "LV: Found a strided access that we can version")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a strided access that we can version" ; } } while (0); |
4011 | DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"; } } while (0); |
4012 | Strides[Ptr] = Stride; |
4013 | StrideSet.insert(Stride); |
4014 | } |
4015 | |
4016 | void LoopVectorizationLegality::collectLoopUniforms() { |
4017 | // We now know that the loop is vectorizable! |
4018 | // Collect variables that will remain uniform after vectorization. |
4019 | std::vector<Value*> Worklist; |
4020 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
4021 | |
4022 | // Start with the conditional branch and walk up the block. |
4023 | Worklist.push_back(Latch->getTerminator()->getOperand(0)); |
4024 | |
4025 | // Also add all consecutive pointer values; these values will be uniform |
4026 | // after vectorization (and subsequent cleanup) and, until revectorization is |
4027 | // supported, all dependencies must also be uniform. |
4028 | for (Loop::block_iterator B = TheLoop->block_begin(), |
4029 | BE = TheLoop->block_end(); B != BE; ++B) |
4030 | for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); |
4031 | I != IE; ++I) |
4032 | if (I->getType()->isPointerTy() && isConsecutivePtr(I)) |
4033 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
4034 | |
4035 | while (Worklist.size()) { |
4036 | Instruction *I = dyn_cast<Instruction>(Worklist.back()); |
4037 | Worklist.pop_back(); |
4038 | |
4039 | // Look at instructions inside this loop. |
4040 | // Stop when reaching PHI nodes. |
4041 | // TODO: we need to follow values all over the loop, not only in this block. |
4042 | if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) |
4043 | continue; |
4044 | |
4045 | // This is a known uniform. |
4046 | Uniforms.insert(I); |
4047 | |
4048 | // Insert all operands. |
4049 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
4050 | } |
4051 | } |
4052 | |
4053 | namespace { |
4054 | /// \brief Analyses memory accesses in a loop. |
4055 | /// |
4056 | /// Checks whether run time pointer checks are needed and builds sets for data |
4057 | /// dependence checking. |
4058 | class AccessAnalysis { |
4059 | public: |
4060 | /// \brief Read or write access location. |
4061 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4062 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4063 | |
4064 | /// \brief Set of potential dependent memory accesses. |
4065 | typedef EquivalenceClasses<MemAccessInfo> DepCandidates; |
4066 | |
4067 | AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : |
4068 | DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} |
4069 | |
4070 | /// \brief Register a load and whether it is only read from. |
4071 | void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { |
4072 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4073 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4074 | Accesses.insert(MemAccessInfo(Ptr, false)); |
4075 | if (IsReadOnly) |
4076 | ReadOnlyPtr.insert(Ptr); |
4077 | } |
4078 | |
4079 | /// \brief Register a store. |
4080 | void addStore(AliasAnalysis::Location &Loc) { |
4081 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4082 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4083 | Accesses.insert(MemAccessInfo(Ptr, true)); |
4084 | } |
4085 | |
4086 | /// \brief Check whether we can check the pointers at runtime for |
4087 | /// non-intersection. |
4088 | bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4089 | unsigned &NumComparisons, ScalarEvolution *SE, |
4090 | Loop *TheLoop, ValueToValueMap &Strides, |
4091 | bool ShouldCheckStride = false); |
4092 | |
4093 | /// \brief Goes over all memory accesses, checks whether a RT check is needed |
4094 | /// and builds sets of dependent accesses. |
4095 | void buildDependenceSets() { |
4096 | processMemAccesses(); |
4097 | } |
4098 | |
4099 | bool isRTCheckNeeded() { return IsRTCheckNeeded; } |
4100 | |
4101 | bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } |
4102 | void resetDepChecks() { CheckDeps.clear(); } |
4103 | |
4104 | MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } |
4105 | |
4106 | private: |
4107 | typedef SetVector<MemAccessInfo> PtrAccessSet; |
4108 | |
4109 | /// \brief Go over all memory accesses and check whether runtime pointer
4110 | /// checks are needed, and build sets of dependency check candidates.
4111 | void processMemAccesses(); |
4112 | |
4113 | /// Set of all accesses. |
4114 | PtrAccessSet Accesses; |
4115 | |
4116 | /// Set of accesses that need a further dependence check. |
4117 | MemAccessInfoSet CheckDeps; |
4118 | |
4119 | /// Set of pointers that are read only. |
4120 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
4121 | |
4122 | const DataLayout *DL; |
4123 | |
4124 | /// An alias set tracker to partition the access set by underlying object and |
4125 | /// intrinsic property (such as TBAA metadata).
4126 | AliasSetTracker AST; |
4127 | |
4128 | /// Sets of potentially dependent accesses - members of one set share an |
4129 | /// underlying pointer. The set "CheckDeps" identifies which sets really need a
4130 | /// dependence check. |
4131 | DepCandidates &DepCands; |
4132 | |
4133 | bool IsRTCheckNeeded; |
4134 | }; |
4135 | |
4136 | } // end anonymous namespace |
4137 | |
4138 | /// \brief Check whether a pointer can participate in a runtime bounds check. |
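 | /// A pointer qualifies when its SCEV (after replacing symbolic strides) is
 | /// an affine add recurrence, e.g. {%base,+,4}<%loop>.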
4139 | static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, |
4140 | Value *Ptr) { |
4141 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
4142 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4143 | if (!AR) |
4144 | return false; |
4145 | |
4146 | return AR->isAffine(); |
4147 | } |
4148 | |
4149 | /// \brief Check the stride of the pointer and ensure that it does not wrap in |
4150 | /// the address space. |
4151 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4152 | const Loop *Lp, ValueToValueMap &StridesMap); |
4153 | |
4154 | bool AccessAnalysis::canCheckPtrAtRT( |
4155 | LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4156 | unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, |
4157 | ValueToValueMap &StridesMap, bool ShouldCheckStride) { |
4158 | // Find pointers with computable bounds. We are going to use this information |
4159 | // to place a runtime bound check. |
4160 | bool CanDoRT = true; |
4161 | |
4162 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
4163 | NumComparisons = 0; |
4164 | |
4165 | // We assign a consecutive id to accesses from different alias sets.
4166 | // Accesses between different groups don't need to be checked.
4167 | unsigned ASId = 1; |
4168 | for (auto &AS : AST) { |
4169 | unsigned NumReadPtrChecks = 0; |
4170 | unsigned NumWritePtrChecks = 0; |
4171 | |
4172 | // We assign consecutive ids to accesses from different dependence sets.
4173 | // Accesses within the same set don't need a runtime check.
4174 | unsigned RunningDepId = 1; |
4175 | DenseMap<Value *, unsigned> DepSetId; |
4176 | |
4177 | for (auto A : AS) { |
4178 | Value *Ptr = A.getValue(); |
4179 | bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); |
4180 | MemAccessInfo Access(Ptr, IsWrite); |
4181 | |
4182 | if (IsWrite) |
4183 | ++NumWritePtrChecks; |
4184 | else |
4185 | ++NumReadPtrChecks; |
4186 | |
4187 | if (hasComputableBounds(SE, StridesMap, Ptr) && |
4188 | // When we run after a failing dependency check we have to make sure we |
4189 | // don't have wrapping pointers. |
4190 | (!ShouldCheckStride || |
4191 | isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { |
4192 | // The id of the dependence set. |
4193 | unsigned DepId; |
4194 | |
4195 | if (IsDepCheckNeeded) { |
4196 | Value *Leader = DepCands.getLeaderValue(Access).getPointer(); |
4197 | unsigned &LeaderId = DepSetId[Leader]; |
4198 | if (!LeaderId) |
4199 | LeaderId = RunningDepId++; |
4200 | DepId = LeaderId; |
4201 | } else |
4202 | // Each access has its own dependence set. |
4203 | DepId = RunningDepId++; |
4204 | |
4205 | RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); |
4206 | |
4207 | DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'; } } while (0); |
4208 | } else { |
4209 | CanDoRT = false; |
4210 | } |
4211 | } |
4212 | |
4213 | if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) |
4214 | NumComparisons += 0; // Only one dependence set. |
4215 | else { |
4216 | NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + |
4217 | NumWritePtrChecks - 1)); |
4218 | } |
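 | // E.g. an alias set with 2 writes and 3 reads in distinct dependence sets
 | // adds 2 * (3 + 2 - 1) == 8 to NumComparisons.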
4219 | |
4220 | ++ASId; |
4221 | } |
4222 | |
4223 | // If the pointers that we would use for the bounds comparison have different |
4224 | // address spaces, assume the values aren't directly comparable, so we can't |
4225 | // use them for the runtime check. We also have to assume they could |
4226 | // overlap. In the future there should be metadata for whether address spaces |
4227 | // are disjoint. |
4228 | unsigned NumPointers = RtCheck.Pointers.size(); |
4229 | for (unsigned i = 0; i < NumPointers; ++i) { |
4230 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
4231 | // Only need to check pointers between two different dependency sets. |
4232 | if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) |
4233 | continue; |
4234 | // Only need to check pointers in the same alias set. |
4235 | if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) |
4236 | continue; |
4237 | |
4238 | Value *PtrI = RtCheck.Pointers[i]; |
4239 | Value *PtrJ = RtCheck.Pointers[j]; |
4240 | |
4241 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
4242 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
4243 | if (ASi != ASj) { |
4244 | DEBUG(dbgs() << "LV: Runtime check would require comparison between"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0) |
4245 | " different address spaces\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0); |
4246 | return false; |
4247 | } |
4248 | } |
4249 | } |
4250 | |
4251 | return CanDoRT; |
4252 | } |
4253 | |
4254 | void AccessAnalysis::processMemAccesses() { |
4255 | // We process the set twice: first we process read-write pointers, last we |
4256 | // process read-only pointers. This allows us to skip dependence tests for |
4257 | // read-only pointers. |
4258 | |
4259 | DEBUG(dbgs() << "LV: Processing memory accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Processing memory accesses...\n" ; } } while (0); |
4260 | DEBUG(dbgs() << " AST: "; AST.dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " AST: "; AST.dump(); } } while (0); |
4261 | DEBUG(dbgs() << "LV: Accesses:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Accesses:\n"; } } while (0); |
4262 | DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4263 | for (auto A : Accesses)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4264 | dbgs() << "\t" << *A.getPointer() << " (" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4265 | (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4266 | "read-only" : "read")) << ")\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4267 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0); |
4268 | |
4269 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
4270 | // compatibility and potential for underlying-object overlap. As a result, we |
4271 | // only need to check for potential pointer dependencies within each alias |
4272 | // set. |
4273 | for (auto &AS : AST) { |
4274 | // Note that both the alias-set tracker and the alias sets themselves use
4275 | // linked lists internally and so the iteration order here is deterministic |
4276 | // (matching the original instruction order within each set). |
4277 | |
4278 | bool SetHasWrite = false; |
4279 | |
4280 | // Map of pointers to last access encountered. |
4281 | typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
4282 | UnderlyingObjToAccessMap ObjToLastAccess; |
4283 | |
4284 | // Set of accesses to check after all writes have been processed.
4285 | PtrAccessSet DeferredAccesses; |
4286 | |
4287 | // Iterate over each alias set twice, once to process read/write pointers, |
4288 | // and then to process read-only pointers. |
4289 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
4290 | bool UseDeferred = SetIteration > 0; |
4291 | PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; |
4292 | |
4293 | for (auto A : AS) { |
4294 | Value *Ptr = A.getValue(); |
4295 | bool IsWrite = S.count(MemAccessInfo(Ptr, true)); |
4296 | |
4297 | // If we're using the deferred access set, then it contains only reads. |
4298 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
4299 | if (UseDeferred && !IsReadOnlyPtr) |
4300 | continue; |
4301 | // Otherwise, the pointer must be in the PtrAccessSet, either as a read |
4302 | // or a write. |
4303 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
4304 | S.count(MemAccessInfo(Ptr, false))) &&
4305 | "Alias-set pointer not in the access set?");
4306 | |
4307 | MemAccessInfo Access(Ptr, IsWrite); |
4308 | DepCands.insert(Access); |
4309 | |
4310 | // Memorize read-only pointers for later processing and skip them in the |
4311 | // first round (they need to be checked after we have seen all write |
4312 | // pointers). Note: we also mark pointers that are not consecutive as
4313 | // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need |
4314 | // the second check for "!IsWrite". |
4315 | if (!UseDeferred && IsReadOnlyPtr) { |
4316 | DeferredAccesses.insert(Access); |
4317 | continue; |
4318 | } |
4319 | |
4320 | // If this is a write, check other reads and writes for conflicts. If
4321 | // this is a read, only check other writes for conflicts (but only if
4322 | // there is no other write to the ptr - this is an optimization to |
4323 | // catch "a[i] = a[i] + " without having to do a dependence check). |
4324 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
4325 | CheckDeps.insert(Access); |
4326 | IsRTCheckNeeded = true; |
4327 | } |
4328 | |
4329 | if (IsWrite) |
4330 | SetHasWrite = true; |
4331 | |
4332 | // Create sets of pointers connected by a shared alias set and |
4333 | // underlying object. |
4334 | typedef SmallVector<Value *, 16> ValueVector; |
4335 | ValueVector TempObjects; |
4336 | GetUnderlyingObjects(Ptr, TempObjects, DL); |
4337 | for (Value *UnderlyingObj : TempObjects) { |
4338 | UnderlyingObjToAccessMap::iterator Prev = |
4339 | ObjToLastAccess.find(UnderlyingObj); |
4340 | if (Prev != ObjToLastAccess.end()) |
4341 | DepCands.unionSets(Access, Prev->second); |
4342 | |
4343 | ObjToLastAccess[UnderlyingObj] = Access; |
4344 | } |
4345 | } |
4346 | } |
4347 | } |
4348 | } |
4349 | |
4350 | namespace { |
4351 | /// \brief Checks memory dependences among accesses to the same underlying |
4352 | /// object to determine whether vectorization is legal or not (and at
4353 | /// which vectorization factor). |
4354 | /// |
4355 | /// This class works under the assumption that we already checked that memory |
4356 | /// locations with different underlying pointers are "must-not alias". |
4357 | /// We use the ScalarEvolution framework to symbolically evaluate pairs of
4358 | /// access functions. Since we currently don't restructure the loop we can rely
4359 | /// on the program order of memory accesses to determine their safety. |
4360 | /// At the moment we will only deem accesses as safe for: |
4361 | /// * A negative constant distance assuming program order. |
4362 | /// |
4363 | /// Safe: tmp = a[i + 1]; OR a[i + 1] = x; |
4364 | /// a[i] = tmp; y = a[i]; |
4365 | /// |
4366 | /// The latter case is safe because later checks guarantee that there can't
4367 | /// be a cycle through a phi node (that is, we check that "x" and "y" are not
4368 | /// the same variable: a header phi can only be an induction or a reduction, a |
4369 | /// reduction can't have a memory sink, an induction can't have a memory |
4370 | /// source). This is important and must not be violated (or we have to |
4371 | /// resort to checking for cycles through memory). |
4372 | /// |
4373 | /// * A positive constant distance assuming program order that is bigger |
4374 | /// than the biggest memory access. |
4375 | /// |
4376 | /// tmp = a[i] OR b[i] = x |
4377 | /// a[i+2] = tmp y = b[i+2]; |
4378 | /// |
4379 | /// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. |
4380 | /// |
4381 | /// * Zero distances and all accesses have the same size. |
4382 | /// |
4383 | class MemoryDepChecker { |
4384 | public: |
4385 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4386 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4387 | |
4388 | MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) |
4389 | : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), |
4390 | ShouldRetryWithRuntimeCheck(false) {} |
4391 | |
4392 | /// \brief Register the location (instructions are given increasing numbers) |
4393 | /// of a write access. |
4394 | void addAccess(StoreInst *SI) { |
4395 | Value *Ptr = SI->getPointerOperand(); |
4396 | Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); |
4397 | InstMap.push_back(SI); |
4398 | ++AccessIdx; |
4399 | } |
4400 | |
4401 | /// \brief Register the location (instructions are given increasing numbers) |
4402 | /// of a read access.
4403 | void addAccess(LoadInst *LI) { |
4404 | Value *Ptr = LI->getPointerOperand(); |
4405 | Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); |
4406 | InstMap.push_back(LI); |
4407 | ++AccessIdx; |
4408 | } |
4409 | |
4410 | /// \brief Check whether the dependencies between the accesses are safe. |
4411 | /// |
4412 | /// Only checks sets with elements in \p CheckDeps. |
4413 | bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4414 | MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); |
4415 | |
4416 | /// \brief The maximum number of bytes of a vector register we can vectorize |
4417 | /// the accesses safely with. |
4418 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
4419 | |
4420 | /// \brief In some cases, when the dependency check fails, we can still
4421 | /// vectorize the loop with a dynamic array access check. |
4422 | bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } |
4423 | |
4424 | private: |
4425 | ScalarEvolution *SE; |
4426 | const DataLayout *DL; |
4427 | const Loop *InnermostLoop; |
4428 | |
4429 | /// \brief Maps access locations (ptr, read/write) to program order. |
4430 | DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; |
4431 | |
4432 | /// \brief Memory access instructions in program order. |
4433 | SmallVector<Instruction *, 16> InstMap; |
4434 | |
4435 | /// \brief The program order index to be used for the next instruction. |
4436 | unsigned AccessIdx; |
4437 | |
4438 | // We can access this many bytes in parallel safely. |
4439 | unsigned MaxSafeDepDistBytes; |
4440 | |
4441 | /// \brief If we see a non-constant dependence distance we can still try to |
4442 | /// vectorize this loop with runtime checks. |
4443 | bool ShouldRetryWithRuntimeCheck; |
4444 | |
4445 | /// \brief Check whether there is a plausible dependence between the two |
4446 | /// accesses. |
4447 | /// |
4448 | /// Access \p A must happen before \p B in program order. The two indices |
4449 | /// identify the index into the program order map. |
4450 | /// |
4451 | /// This function checks whether there is a plausible dependence (or the |
4452 | /// absence of such can't be proved) between the two accesses. If there is a |
4453 | /// plausible dependence but the dependence distance is bigger than one |
4454 | /// element access, it records this distance in \p MaxSafeDepDistBytes (if this
4455 | /// distance is smaller than any other distance encountered so far). |
4456 | /// Otherwise, this function returns true signaling a possible dependence. |
4457 | bool isDependent(const MemAccessInfo &A, unsigned AIdx, |
4458 | const MemAccessInfo &B, unsigned BIdx, |
4459 | ValueToValueMap &Strides); |
4460 | |
4461 | /// \brief Check whether the data dependence could prevent store-load |
4462 | /// forwarding. |
4463 | bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); |
4464 | }; |
4465 | |
4466 | } // end anonymous namespace |
4467 | |
4468 | static bool isInBoundsGep(Value *Ptr) { |
4469 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) |
4470 | return GEP->isInBounds(); |
4471 | return false; |
4472 | } |
4473 | |
4474 | /// \brief Check whether the access through \p Ptr has a constant stride. |
4475 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4476 | const Loop *Lp, ValueToValueMap &StridesMap) { |
4477 | const Type *Ty = Ptr->getType(); |
4478 | assert(Ty->isPointerTy() && "Unexpected non-ptr");
4479 | |
4480 | // Make sure that the pointer does not point to aggregate types. |
4481 | const PointerType *PtrTy = cast<PointerType>(Ty); |
4482 | if (PtrTy->getElementType()->isAggregateType()) { |
4483 | DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0) |
4484 | "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0); |
4485 | return 0; |
4486 | } |
4487 | |
4488 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); |
4489 | |
4490 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4491 | if (!AR) { |
4492 | DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4493 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4494 | return 0; |
4495 | } |
4496 | |
4497 | // The access function must stride over the innermost loop.
4498 | if (Lp != AR->getLoop()) { |
4499 | DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4500 | *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4501 | } |
4502 | |
4503 | // The address calculation must not wrap. Otherwise, a dependence could be |
4504 | // inverted. |
4505 | // An inbounds getelementptr that is an AddRec with a unit stride
4506 | // cannot wrap by definition. The unit stride requirement is checked later.
4507 | // A getelementptr without an inbounds attribute and with unit stride would
4508 | // have to access the pointer value "0", which is undefined behavior in
4509 | // address space 0; therefore we can also vectorize this case.
4510 | bool IsInBoundsGEP = isInBoundsGep(Ptr); |
4511 | bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); |
4512 | bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; |
4513 | if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { |
4514 | DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4515 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4516 | return 0; |
4517 | } |
4518 | |
4519 | // Check the step is constant. |
4520 | const SCEV *Step = AR->getStepRecurrence(*SE); |
4521 | |
4522 | // Calculate the pointer stride and check if it is consecutive. |
4523 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
4524 | if (!C) { |
4525 | DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4526 | " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4527 | return 0; |
4528 | } |
4529 | |
4530 | int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); |
4531 | const APInt &APStepVal = C->getValue()->getValue(); |
4532 | |
4533 | // Huge step value - give up. |
4534 | if (APStepVal.getBitWidth() > 64) |
4535 | return 0; |
4536 | |
4537 | int64_t StepVal = APStepVal.getSExtValue(); |
4538 | |
4539 | // Strided access. |
4540 | int64_t Stride = StepVal / Size; |
4541 | int64_t Rem = StepVal % Size; |
4542 | if (Rem) |
4543 | return 0; |
4544 | |
4545 | // If the SCEV could wrap but we have an inbounds gep with a unit stride we |
4546 | // know we can't "wrap around the address space". In case of address space |
4547 | // zero we know that this won't happen without triggering undefined behavior. |
4548 | if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && |
4549 | Stride != 1 && Stride != -1) |
4550 | return 0; |
4551 | |
4552 | return Stride; |
4553 | } |
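     |
     | // Illustrative sketch (not part of the original source): the stride
     | // computed above reduces to StepVal / Size with an exact-division
     | // requirement. The values in the comments are hypothetical.
     | static int64_t exampleStride(int64_t StepVal, int64_t Size) {
     |   if (StepVal % Size)    // mixed granularity: not a strided access
     |     return 0;
     |   return StepVal / Size; // e.g. a 16-byte step over double (8 bytes) -> 2
     | }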
4554 | |
4555 | bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, |
4556 | unsigned TypeByteSize) { |
4557 | // If loads occur at a distance that is not a multiple of a feasible vector
4558 | // factor, store-load forwarding does not take place.
4559 | // Positive dependences might cause trouble because vectorizing them might
4560 | // prevent store-load forwarding, making vectorized code run a lot slower.
4561 | // a[i] = a[i-3] ^ a[i-8];
4562 | // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
4563 | // hence on your typical architecture store-load forwarding does not take
4564 | // place. Vectorizing in such cases does not make sense.
4565 | // Store-load forwarding distance. |
4566 | const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; |
4567 | // Maximum vector factor. |
4568 | unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; |
4569 | if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
4570 | MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; |
4571 | |
4572 | for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; |
4573 | vf *= 2) { |
4574 | if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { |
4575 | MaxVFWithoutSLForwardIssues = (vf >>= 1);
4576 | break; |
4577 | } |
4578 | } |
4579 | |
4580 | if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
4581 | DEBUG(dbgs() << "LV: Distance " << Distance <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0) |
4582 | " that could cause a store-load forwarding conflict\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0); |
4583 | return true; |
4584 | } |
4585 | |
4586 | if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && |
4587 | MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) |
4588 | MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; |
4589 | return false; |
4590 | } |
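     |
     | // Illustrative sketch (not part of the original source) mirroring the scan
     | // above. With hypothetical inputs Distance == 12 and TypeByteSize == 4 (the
     | // a[i] = a[i-3] ^ ... case), vf == 8 trips the test: 12 % 8 != 0 and
     | // 12 / 8 == 1 < 8 * 4, leaving no room for even a two-element vector.
     | static bool exampleForwardingConflict(unsigned Distance, unsigned TypeByteSize,
     |                                       unsigned MaxVFBytes) {
     |   const unsigned Cycles = 8 * TypeByteSize; // forwarding distance in cycles
     |   for (unsigned vf = 2 * TypeByteSize; vf <= MaxVFBytes; vf *= 2)
     |     if (Distance % vf && Distance / vf < Cycles)
     |       return (vf >> 1) < 2 * TypeByteSize; // conflict if capped below 2-wide
     |   return false;
     | }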
4591 | |
4592 | bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
4593 | const MemAccessInfo &B, unsigned BIdx, |
4594 | ValueToValueMap &Strides) { |
4595 | assert(AIdx < BIdx && "Must pass arguments in program order");
4596 | |
4597 | Value *APtr = A.getPointer(); |
4598 | Value *BPtr = B.getPointer(); |
4599 | bool AIsWrite = A.getInt(); |
4600 | bool BIsWrite = B.getInt(); |
4601 | |
4602 | // Two reads are independent. |
4603 | if (!AIsWrite && !BIsWrite) |
4604 | return false; |
4605 | |
4606 | // We cannot check pointers in different address spaces. |
4607 | if (APtr->getType()->getPointerAddressSpace() != |
4608 | BPtr->getType()->getPointerAddressSpace()) |
4609 | return true; |
4610 | |
4611 | const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); |
4612 | const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); |
4613 | |
4614 | int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); |
4615 | int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); |
4616 | |
4617 | const SCEV *Src = AScev; |
4618 | const SCEV *Sink = BScev; |
4619 | |
4620 | // If the induction step is negative we have to invert source and sink of the |
4621 | // dependence. |
4622 | if (StrideAPtr < 0) { |
4623 | //Src = BScev; |
4624 | //Sink = AScev; |
4625 | std::swap(APtr, BPtr); |
4626 | std::swap(Src, Sink); |
4627 | std::swap(AIsWrite, BIsWrite); |
4628 | std::swap(AIdx, BIdx); |
4629 | std::swap(StrideAPtr, StrideBPtr); |
4630 | } |
4631 | |
4632 | const SCEV *Dist = SE->getMinusSCEV(Sink, Src); |
4633 | |
4634 | DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sinkdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0) |
4635 | << "(Induction step: " << StrideAPtr << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0); |
4636 | DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0) |
4637 | << *InstMap[BIdx] << ": " << *Dist << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0); |
4638 | |
4639 | // Need consecutive accesses. We don't want to vectorize |
4640 | // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in |
4641 | // the address space. |
4642 | if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
4643 | DEBUG(dbgs() << "Non-consecutive pointer access\n");
4644 | return true; |
4645 | } |
4646 | |
4647 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); |
4648 | if (!C) { |
4649 | DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence because of non-constant distance\n" ; } } while (0); |
4650 | ShouldRetryWithRuntimeCheck = true; |
4651 | return true; |
4652 | } |
4653 | |
4654 | Type *ATy = APtr->getType()->getPointerElementType(); |
4655 | Type *BTy = BPtr->getType()->getPointerElementType(); |
4656 | unsigned TypeByteSize = DL->getTypeAllocSize(ATy); |
4657 | |
4658 | // Negative distances are not plausible dependencies. |
4659 | const APInt &Val = C->getValue()->getValue(); |
4660 | if (Val.isNegative()) { |
4661 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
4662 | if (IsTrueDataDependence && |
4663 | (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || |
4664 | ATy != BTy)) |
4665 | return true; |
4666 | |
4667 | DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence is negative: NoDep\n" ; } } while (0); |
4668 | return false; |
4669 | } |
4670 | |
4671 | // Write to the same location with the same size. |
4672 | // Could be improved to assert type sizes are the same (i32 == float, etc). |
4673 | if (Val == 0) { |
4674 | if (ATy == BTy) |
4675 | return false; |
4676 | DEBUG(dbgs() << "LV: Zero dependence difference but different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Zero dependence difference but different types\n" ; } } while (0); |
4677 | return true; |
4678 | } |
4679 | |
4680 | assert(Val.isStrictlyPositive() && "Expect a positive value");
4681 | |
4682 | // Positive distance bigger than max vectorization factor. |
4683 | if (ATy != BTy) { |
4684 | DEBUG(dbgs() <<
4685 | "LV: ReadWrite-Write positive dependency with different types\n");
4686 | return false; |
4687 | } |
4688 | |
4689 | unsigned Distance = (unsigned) Val.getZExtValue(); |
4690 | |
4691 | // Bail out early if passed-in parameters make vectorization not feasible. |
4692 | unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; |
4693 | unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; |
4694 | |
4695 | // The distance must be bigger than the size needed for a vectorized version |
4696 | // of the operation and the size of the vectorized operation must not be |
4697 | // bigger than the current maximum size.
4698 | if (Distance < 2*TypeByteSize || |
4699 | 2*TypeByteSize > MaxSafeDepDistBytes || |
4700 | Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { |
4701 | DEBUG(dbgs() << "LV: Failure because of Positive distance "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0) |
4702 | << Val.getSExtValue() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0); |
4703 | return true; |
4704 | } |
4705 | |
4706 | MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? |
4707 | Distance : MaxSafeDepDistBytes; |
4708 | |
4709 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
4710 | if (IsTrueDataDependence && |
4711 | couldPreventStoreLoadForward(Distance, TypeByteSize)) |
4712 | return true; |
4713 | |
4714 | DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0) |
4715 | " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0); |
4716 | |
4717 | return false; |
4718 | } |
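     |
     | // Illustrative sketch (not part of the original source) of the final
     | // positive-distance test above, assuming no user-forced vectorization
     | // factor or interleave (both treated as 1). For 4-byte ints, a distance of
     | // 4 bytes (a[i+1] = a[i]) is rejected, while 32 bytes passes and later caps
     | // the safe vectorization factor at 8 elements.
     | static bool exampleDistanceUnsafe(unsigned Distance, unsigned TypeByteSize,
     |                                   unsigned MaxSafeBytes) {
     |   return Distance < 2 * TypeByteSize ||     // no room for a 2-wide vector
     |          2 * TypeByteSize > MaxSafeBytes || // already capped below 2-wide
     |          Distance < TypeByteSize * 1 * 1;   // ForcedUnroll * ForcedFactor
     | }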
4719 | |
4720 | bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4721 | MemAccessInfoSet &CheckDeps, |
4722 | ValueToValueMap &Strides) { |
4723 | |
4724 | MaxSafeDepDistBytes = -1U; |
4725 | while (!CheckDeps.empty()) { |
4726 | MemAccessInfo CurAccess = *CheckDeps.begin(); |
4727 | |
4728 | // Get the relevant memory access set. |
4729 | EquivalenceClasses<MemAccessInfo>::iterator I = |
4730 | AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); |
4731 | |
4732 | // Check accesses within this set. |
4733 | EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; |
4734 | AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); |
4735 | |
4736 | // Check every access pair. |
4737 | while (AI != AE) { |
4738 | CheckDeps.erase(*AI); |
4739 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); |
4740 | while (OI != AE) { |
4741 | // Check every accessing instruction pair in program order. |
4742 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
4743 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
4744 | for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), |
4745 | I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { |
4746 | if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) |
4747 | return false; |
4748 | if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) |
4749 | return false; |
4750 | } |
4751 | ++OI; |
4752 | } |
4753 | ++AI;
4754 | } |
4755 | } |
4756 | return true; |
4757 | } |
4758 | |
4759 | bool LoopVectorizationLegality::canVectorizeMemory() { |
4760 | |
4761 | typedef SmallVector<Value*, 16> ValueVector; |
4762 | typedef SmallPtrSet<Value*, 16> ValueSet; |
4763 | |
4764 | // Holds the Load and Store *instructions*. |
4765 | ValueVector Loads; |
4766 | ValueVector Stores; |
4767 | |
4768 | // Holds all the different accesses in the loop. |
4769 | unsigned NumReads = 0; |
4770 | unsigned NumReadWrites = 0; |
4771 | |
4772 | PtrRtCheck.Pointers.clear(); |
4773 | PtrRtCheck.Need = false; |
4774 | |
4775 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
4776 | MemoryDepChecker DepChecker(SE, DL, TheLoop); |
4777 | |
4778 | // For each block. |
4779 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
4780 | be = TheLoop->block_end(); bb != be; ++bb) { |
4781 | |
4782 | // Scan the BB and collect legal loads and stores. |
4783 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
4784 | ++it) { |
4785 | |
4786 | // If this is a load, save it. If this instruction can read from memory |
4787 | // but is not a load, then we quit. Notice that we don't handle function |
4788 | // calls that read or write. |
4789 | if (it->mayReadFromMemory()) { |
4790 | // Many math library functions read the rounding mode. We will only |
4791 | // vectorize a loop if it contains known function calls that don't set |
4792 | // the flag. Therefore, it is safe to ignore this read from memory. |
4793 | CallInst *Call = dyn_cast<CallInst>(it); |
4794 | if (Call && getIntrinsicIDForCall(Call, TLI)) |
4795 | continue; |
4796 | |
4797 | LoadInst *Ld = dyn_cast<LoadInst>(it); |
4798 | if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { |
4799 | emitAnalysis(Report(Ld) |
4800 | << "read with atomic ordering or volatile read"); |
4801 | DEBUG(dbgs() << "LV: Found a non-simple load.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple load.\n" ; } } while (0); |
4802 | return false; |
4803 | } |
4804 | NumLoads++; |
4805 | Loads.push_back(Ld); |
4806 | DepChecker.addAccess(Ld); |
4807 | continue; |
4808 | } |
4809 | |
4810 | // Save 'store' instructions. Abort if other instructions write to memory. |
4811 | if (it->mayWriteToMemory()) { |
4812 | StoreInst *St = dyn_cast<StoreInst>(it); |
4813 | if (!St) { |
4814 | emitAnalysis(Report(it) << "instruction cannot be vectorized"); |
4815 | return false; |
4816 | } |
4817 | if (!St->isSimple() && !IsAnnotatedParallel) { |
4818 | emitAnalysis(Report(St) |
4819 | << "write with atomic ordering or volatile write"); |
4820 | DEBUG(dbgs() << "LV: Found a non-simple store.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple store.\n" ; } } while (0); |
4821 | return false; |
4822 | } |
4823 | NumStores++; |
4824 | Stores.push_back(St); |
4825 | DepChecker.addAccess(St); |
4826 | } |
4827 | } // Next instr. |
4828 | } // Next block. |
4829 | |
4830 | // Now we have two lists that hold the loads and the stores. |
4831 | // Next, we find the pointers that they use. |
4832 | |
4833 | // Check if we see any stores. If there are no stores, then we don't |
4834 | // care if the pointers are *restrict*. |
4835 | if (!Stores.size()) { |
4836 | DEBUG(dbgs() << "LV: Found a read-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a read-only loop!\n" ; } } while (0); |
4837 | return true; |
4838 | } |
4839 | |
4840 | AccessAnalysis::DepCandidates DependentAccesses; |
4841 | AccessAnalysis Accesses(DL, AA, DependentAccesses); |
4842 | |
4843 | // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects |
4844 | // multiple times on the same object. If the ptr is accessed twice, once |
4845 | // for read and once for write, it will only appear once (on the write |
4846 | // list). This is okay, since we are going to check for conflicts between |
4847 | // writes and between reads and writes, but not between reads and reads. |
4848 | ValueSet Seen; |
4849 | |
4850 | ValueVector::iterator I, IE; |
4851 | for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { |
4852 | StoreInst *ST = cast<StoreInst>(*I); |
4853 | Value* Ptr = ST->getPointerOperand(); |
4854 | |
4855 | if (isUniform(Ptr)) { |
4856 | emitAnalysis( |
4857 | Report(ST) |
4858 | << "write to a loop invariant address could not be vectorized"); |
4859 | DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We don't allow storing to uniform addresses\n" ; } } while (0); |
4860 | return false; |
4861 | } |
4862 | |
4863 | // If we did *not* see this pointer before, insert it to the read-write |
4864 | // list. At this phase it is only a 'write' list. |
4865 | if (Seen.insert(Ptr).second) { |
4866 | ++NumReadWrites; |
4867 | |
4868 | AliasAnalysis::Location Loc = AA->getLocation(ST); |
4869 | // The TBAA metadata could have a control dependency on the predication |
4870 | // condition, so we cannot rely on it when determining whether or not we |
4871 | // need runtime pointer checks. |
4872 | if (blockNeedsPredication(ST->getParent())) |
4873 | Loc.AATags.TBAA = nullptr; |
4874 | |
4875 | Accesses.addStore(Loc); |
4876 | } |
4877 | } |
4878 | |
4879 | if (IsAnnotatedParallel) { |
4880 | DEBUG(dbgs()
4881 | << "LV: A loop annotated parallel, ignore memory dependency "
4882 | << "checks.\n");
4883 | return true; |
4884 | } |
4885 | |
4886 | for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { |
4887 | LoadInst *LD = cast<LoadInst>(*I); |
4888 | Value* Ptr = LD->getPointerOperand(); |
4889 | // If we did *not* see this pointer before, insert it to the |
4890 | // read list. If we *did* see it before, then it is already in |
4891 | // the read-write list. This allows us to vectorize expressions |
4892 | // such as A[i] += x, because the address of A[i] is a read-write
4893 | // pointer. This only works if the index of A[i] is consecutive.
4894 | // If the index is unknown (for example A[B[i]]) then we may
4895 | // read a few words, modify, and write a few words, and some of the |
4896 | // words may be written to the same address. |
4897 | bool IsReadOnlyPtr = false; |
4898 | if (Seen.insert(Ptr).second || |
4899 | !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { |
4900 | ++NumReads; |
4901 | IsReadOnlyPtr = true; |
4902 | } |
4903 | |
4904 | AliasAnalysis::Location Loc = AA->getLocation(LD); |
4905 | // The TBAA metadata could have a control dependency on the predication |
4906 | // condition, so we cannot rely on it when determining whether or not we |
4907 | // need runtime pointer checks. |
4908 | if (blockNeedsPredication(LD->getParent())) |
4909 | Loc.AATags.TBAA = nullptr; |
4910 | |
4911 | Accesses.addLoad(Loc, IsReadOnlyPtr); |
4912 | } |
4913 | |
4914 | // If we write (or read-write) to a single destination and there are no |
4915 | // other reads in this loop then it is safe to vectorize.
4916 | if (NumReadWrites == 1 && NumReads == 0) { |
4917 | DEBUG(dbgs() << "LV: Found a write-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a write-only loop!\n" ; } } while (0); |
4918 | return true; |
4919 | } |
4920 | |
4921 | // Build dependence sets and check whether we need a runtime pointer bounds |
4922 | // check. |
4923 | Accesses.buildDependenceSets(); |
4924 | bool NeedRTCheck = Accesses.isRTCheckNeeded(); |
4925 | |
4926 | // Find pointers with computable bounds. We are going to use this information |
4927 | // to place a runtime bound check. |
4928 | unsigned NumComparisons = 0; |
4929 | bool CanDoRT = false; |
4930 | if (NeedRTCheck) |
4931 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, |
4932 | Strides); |
4933 | |
4934 | DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0) |
4935 | " pointer comparisons.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0); |
4936 | |
4937 | // If there is only one set of pointers among which to check dependences,
4938 | // we don't need a runtime check.
4939 | if (NumComparisons == 0 && NeedRTCheck) |
4940 | NeedRTCheck = false; |
4941 | |
4942 | // Check that we did not collect too many pointers or found an unsizeable |
4943 | // pointer. |
4944 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4945 | PtrRtCheck.reset(); |
4946 | CanDoRT = false; |
4947 | } |
4948 | |
4949 | if (CanDoRT) { |
4950 | DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can perform a memory runtime check if needed.\n" ; } } while (0); |
4951 | } |
4952 | |
4953 | if (NeedRTCheck && !CanDoRT) { |
4954 | emitAnalysis(Report() << "cannot identify array bounds"); |
4955 | DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0) |
4956 | "the array bounds.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0); |
4957 | PtrRtCheck.reset(); |
4958 | return false; |
4959 | } |
4960 | |
4961 | PtrRtCheck.Need = NeedRTCheck; |
4962 | |
4963 | bool CanVecMem = true; |
4964 | if (Accesses.isDependencyCheckNeeded()) { |
4965 | DEBUG(dbgs() << "LV: Checking memory dependencies\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Checking memory dependencies\n" ; } } while (0); |
4966 | CanVecMem = DepChecker.areDepsSafe( |
4967 | DependentAccesses, Accesses.getDependenciesToCheck(), Strides); |
4968 | MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); |
4969 | |
4970 | if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { |
4971 | DEBUG(dbgs() << "LV: Retrying with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Retrying with memory checks\n" ; } } while (0); |
4972 | NeedRTCheck = true; |
4973 | |
4974 | // Clear the dependency checks. We assume they are not needed. |
4975 | Accesses.resetDepChecks(); |
4976 | |
4977 | PtrRtCheck.reset(); |
4978 | PtrRtCheck.Need = true; |
4979 | |
4980 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, |
4981 | TheLoop, Strides, true); |
4982 | // Check that we did not collect too many pointers or found an unsizeable |
4983 | // pointer. |
4984 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4985 | if (!CanDoRT && NumComparisons > 0) |
4986 | emitAnalysis(Report() |
4987 | << "cannot check memory dependencies at runtime"); |
4988 | else |
4989 | emitAnalysis(Report() |
4990 | << NumComparisons << " exceeds limit of " |
4991 | << RuntimeMemoryCheckThreshold |
4992 | << " dependent memory operations checked at runtime"); |
4993 | DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize with memory checks\n" ; } } while (0); |
4994 | PtrRtCheck.reset(); |
4995 | return false; |
4996 | } |
4997 | |
4998 | CanVecMem = true; |
4999 | } |
5000 | } |
5001 | |
5002 | if (!CanVecMem) |
5003 | emitAnalysis(Report() << "unsafe dependent memory operations in loop"); |
5004 | |
5005 | DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0) |
5006 | " need a runtime memory check.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0); |
5007 | |
5008 | return CanVecMem; |
5009 | } |
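     |
     | // Illustrative sketch (not part of the original source): a loop shape that
     | // takes the runtime-check path above when 'a' and 'b' cannot be statically
     | // disambiguated. The names are hypothetical.
     | static void exampleNeedsRuntimeCheck(float *a, const float *b, int n) {
     |   for (int i = 0; i < n; ++i)
     |     a[i] = b[i] * 2.0f; // vectorized behind an emitted a/b overlap check
     | }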
5010 | |
5011 | static bool hasMultipleUsesOf(Instruction *I, |
5012 | SmallPtrSetImpl<Instruction *> &Insts) { |
5013 | unsigned NumUses = 0; |
5014 | for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
5015 | if (Insts.count(dyn_cast<Instruction>(*Use))) |
5016 | ++NumUses; |
5017 | if (NumUses > 1) |
5018 | return true; |
5019 | } |
5020 | |
5021 | return false; |
5022 | } |
5023 | |
5024 | static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) { |
5025 | for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
5026 | if (!Set.count(dyn_cast<Instruction>(*Use))) |
5027 | return false; |
5028 | return true; |
5029 | } |
5030 | |
5031 | bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, |
5032 | ReductionKind Kind) { |
5033 | if (Phi->getNumIncomingValues() != 2) |
5034 | return false; |
5035 | |
5036 | // Reduction variables are only found in the loop header block. |
5037 | if (Phi->getParent() != TheLoop->getHeader()) |
5038 | return false; |
5039 | |
5040 | // Obtain the reduction start value from the value that comes from the loop |
5041 | // preheader. |
5042 | Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); |
5043 | |
5044 | // ExitInstruction is the single value which is used outside the loop. |
5045 | // We only allow for a single reduction value to be used outside the loop. |
5046 | // This includes users of the reduction variables (which form a cycle
5047 | // that ends in the phi node).
5048 | Instruction *ExitInstruction = nullptr; |
5049 | // Indicates that we found a reduction operation in our scan. |
5050 | bool FoundReduxOp = false; |
5051 | |
5052 | // We start with the PHI node and scan for all of the users of this |
5053 | // instruction. All users must be instructions that can be used as reduction |
5054 | // variables (such as ADD). We must have a single out-of-block user. The cycle |
5055 | // must include the original PHI. |
5056 | bool FoundStartPHI = false; |
5057 | |
5058 | // To recognize min/max patterns formed by an icmp/select sequence, we store
5059 | // the number of instructions we saw from the recognized min/max pattern,
5060 | // to make sure we only see exactly the two instructions.
5061 | unsigned NumCmpSelectPatternInst = 0; |
5062 | ReductionInstDesc ReduxDesc(false, nullptr); |
5063 | |
5064 | SmallPtrSet<Instruction *, 8> VisitedInsts; |
5065 | SmallVector<Instruction *, 8> Worklist; |
5066 | Worklist.push_back(Phi); |
5067 | VisitedInsts.insert(Phi); |
5068 | |
5069 | // A value in the reduction can be used: |
5070 | // - By the reduction: |
5071 | // - Reduction operation: |
5072 | // - One use of reduction value (safe). |
5073 | // - Multiple use of reduction value (not safe). |
5074 | // - PHI: |
5075 | // - All uses of the PHI must be the reduction (safe). |
5076 | // - Otherwise, not safe. |
5077 | // - By one instruction outside of the loop (safe). |
5078 | // - By further instructions outside of the loop (not safe). |
5079 | // - By an instruction that is not part of the reduction (not safe). |
5080 | // This is either: |
5081 | // * An instruction type other than PHI or the reduction operation. |
5082 | // * A PHI in the header other than the initial PHI. |
5083 | while (!Worklist.empty()) { |
5084 | Instruction *Cur = Worklist.back(); |
5085 | Worklist.pop_back(); |
5086 | |
5087 | // No Users. |
5088 | // If the instruction has no users then this is a broken chain and can't be |
5089 | // a reduction variable. |
5090 | if (Cur->use_empty()) |
5091 | return false; |
5092 | |
5093 | bool IsAPhi = isa<PHINode>(Cur); |
5094 | |
5095 | // A header PHI use other than the original PHI. |
5096 | if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) |
5097 | return false; |
5098 | |
5099 | // Reductions of instructions such as Div and Sub are only possible if the
5100 | // LHS is the reduction variable.
5101 | if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && |
5102 | !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && |
5103 | !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) |
5104 | return false; |
5105 | |
5106 | // Any reduction instruction must be of one of the allowed kinds. |
5107 | ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); |
5108 | if (!ReduxDesc.IsReduction) |
5109 | return false; |
5110 | |
5111 | // A reduction operation must only have one use of the reduction value. |
5112 | if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && |
5113 | hasMultipleUsesOf(Cur, VisitedInsts)) |
5114 | return false; |
5115 | |
5116 | // All inputs to a PHI node must be a reduction value. |
5117 | if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
5118 | return false; |
5119 | |
5120 | if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || |
5121 | isa<SelectInst>(Cur))) |
5122 | ++NumCmpSelectPatternInst; |
5123 | if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || |
5124 | isa<SelectInst>(Cur))) |
5125 | ++NumCmpSelectPatternInst; |
5126 | |
5127 | // Check whether we found a reduction operator. |
5128 | FoundReduxOp |= !IsAPhi; |
5129 | |
5130 | // Process users of current instruction. Push non-PHI nodes after PHI nodes |
5131 | // onto the stack. This way we are going to have seen all inputs to PHI |
5132 | // nodes once we get to them. |
5133 | SmallVector<Instruction *, 8> NonPHIs; |
5134 | SmallVector<Instruction *, 8> PHIs; |
5135 | for (User *U : Cur->users()) { |
5136 | Instruction *UI = cast<Instruction>(U); |
5137 | |
5138 | // Check if we found the exit user. |
5139 | BasicBlock *Parent = UI->getParent(); |
5140 | if (!TheLoop->contains(Parent)) { |
5141 | // Exit if you find multiple outside users or if the header phi node is |
5142 | // being used. In this case the user uses the value of the previous |
5143 | // iteration, in which case we would lose "VF-1" iterations of the
5144 | // reduction operation if we vectorize. |
5145 | if (ExitInstruction != nullptr || Cur == Phi) |
5146 | return false; |
5147 | |
5148 | // The instruction used by an outside user must be the last instruction |
5149 | // before we feed back to the reduction phi. Otherwise, we lose VF-1
5150 | // operations on the value. |
5151 | if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) |
5152 | return false; |
5153 | |
5154 | ExitInstruction = Cur; |
5155 | continue; |
5156 | } |
5157 | |
5158 | // Process instructions only once (termination). Each reduction cycle |
5159 | // value must only be used once, except by phi nodes and min/max |
5160 | // reductions which are represented as a cmp followed by a select. |
5161 | ReductionInstDesc IgnoredVal(false, nullptr); |
5162 | if (VisitedInsts.insert(UI).second) { |
5163 | if (isa<PHINode>(UI)) |
5164 | PHIs.push_back(UI); |
5165 | else |
5166 | NonPHIs.push_back(UI); |
5167 | } else if (!isa<PHINode>(UI) && |
5168 | ((!isa<FCmpInst>(UI) && |
5169 | !isa<ICmpInst>(UI) && |
5170 | !isa<SelectInst>(UI)) || |
5171 | !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) |
5172 | return false; |
5173 | |
5174 | // Remember that we completed the cycle. |
5175 | if (UI == Phi) |
5176 | FoundStartPHI = true; |
5177 | } |
5178 | Worklist.append(PHIs.begin(), PHIs.end()); |
5179 | Worklist.append(NonPHIs.begin(), NonPHIs.end()); |
5180 | } |
5181 | |
5182 | // This means we have seen one but not the other instruction of the |
5183 | // pattern or more than just a select and cmp. |
5184 | if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && |
5185 | NumCmpSelectPatternInst != 2) |
5186 | return false; |
5187 | |
5188 | if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) |
5189 | return false; |
5190 | |
5191 | // We found a reduction var if we have reached the original phi node and we |
5192 | // only have a single instruction with out-of-loop users. |
5193 | |
5194 | // This instruction is allowed to have out-of-loop users. |
5195 | AllowedExit.insert(ExitInstruction); |
5196 | |
5197 | // Save the description of this reduction variable. |
5198 | ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, |
5199 | ReduxDesc.MinMaxKind); |
5200 | Reductions[Phi] = RD; |
5201 | // We've ended the cycle. This is a reduction variable if we have an |
5202 | // outside user and it has a binary op. |
5203 | |
5204 | return true; |
5205 | } |
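     |
     | // Illustrative sketch (not part of the original source): a loop whose header
     | // phi the search above classifies as an RK_IntegerAdd reduction. The names
     | // are hypothetical.
     | static int exampleAddReduction(const int *a, int n) {
     |   int Sum = 0;   // the preheader value, i.e. RdxStart
     |   for (int i = 0; i < n; ++i)
     |     Sum += a[i]; // the single reduction operation in the cycle
     |   return Sum;    // the one allowed out-of-loop user (ExitInstruction)
     | }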
5206 | |
5207 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
5208 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
5209 | LoopVectorizationLegality::ReductionInstDesc |
5210 | LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, |
5211 | ReductionInstDesc &Prev) { |
5212 | |
5213 | assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
5214 | "Expect a select instruction");
5215 | Instruction *Cmp = nullptr; |
5216 | SelectInst *Select = nullptr; |
5217 | |
5218 | // We must handle the select(cmp()) as a single instruction. Advance to the |
5219 | // select. |
5220 | if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { |
5221 | if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) |
5222 | return ReductionInstDesc(false, I); |
5223 | return ReductionInstDesc(Select, Prev.MinMaxKind); |
5224 | } |
5225 | |
5226 | // Only handle single use cases for now. |
5227 | if (!(Select = dyn_cast<SelectInst>(I))) |
5228 | return ReductionInstDesc(false, I); |
5229 | if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && |
5230 | !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) |
5231 | return ReductionInstDesc(false, I); |
5232 | if (!Cmp->hasOneUse()) |
5233 | return ReductionInstDesc(false, I); |
5234 | |
5235 | Value *CmpLeft; |
5236 | Value *CmpRight; |
5237 | |
5238 | // Look for a min/max pattern. |
5239 | if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5240 | return ReductionInstDesc(Select, MRK_UIntMin); |
5241 | else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5242 | return ReductionInstDesc(Select, MRK_UIntMax); |
5243 | else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5244 | return ReductionInstDesc(Select, MRK_SIntMax); |
5245 | else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5246 | return ReductionInstDesc(Select, MRK_SIntMin); |
5247 | else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5248 | return ReductionInstDesc(Select, MRK_FloatMin); |
5249 | else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5250 | return ReductionInstDesc(Select, MRK_FloatMax); |
5251 | else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5252 | return ReductionInstDesc(Select, MRK_FloatMin); |
5253 | else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5254 | return ReductionInstDesc(Select, MRK_FloatMax); |
5255 | |
5256 | return ReductionInstDesc(false, I); |
5257 | } |
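     |
     | // Illustrative sketch (not part of the original source): source that lowers
     | // to the icmp/select pair matched above as MRK_SIntMin. The names are
     | // hypothetical.
     | static int exampleSMinReduction(const int *a, int n) {
     |   int Min = a[0];
     |   for (int i = 1; i < n; ++i)
     |     Min = a[i] < Min ? a[i] : Min; // icmp slt feeding a single-use select
     |   return Min;
     | }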
5258 | |
5259 | LoopVectorizationLegality::ReductionInstDesc |
5260 | LoopVectorizationLegality::isReductionInstr(Instruction *I, |
5261 | ReductionKind Kind, |
5262 | ReductionInstDesc &Prev) { |
5263 | bool FP = I->getType()->isFloatingPointTy(); |
5264 | bool FastMath = FP && I->hasUnsafeAlgebra(); |
5265 | switch (I->getOpcode()) { |
5266 | default: |
5267 | return ReductionInstDesc(false, I); |
5268 | case Instruction::PHI: |
5269 | if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && |
5270 | Kind != RK_FloatMinMax)) |
5271 | return ReductionInstDesc(false, I); |
5272 | return ReductionInstDesc(I, Prev.MinMaxKind); |
5273 | case Instruction::Sub: |
5274 | case Instruction::Add: |
5275 | return ReductionInstDesc(Kind == RK_IntegerAdd, I); |
5276 | case Instruction::Mul: |
5277 | return ReductionInstDesc(Kind == RK_IntegerMult, I); |
5278 | case Instruction::And: |
5279 | return ReductionInstDesc(Kind == RK_IntegerAnd, I); |
5280 | case Instruction::Or: |
5281 | return ReductionInstDesc(Kind == RK_IntegerOr, I); |
5282 | case Instruction::Xor: |
5283 | return ReductionInstDesc(Kind == RK_IntegerXor, I); |
5284 | case Instruction::FMul: |
5285 | return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); |
5286 | case Instruction::FSub: |
5287 | case Instruction::FAdd: |
5288 | return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); |
5289 | case Instruction::FCmp: |
5290 | case Instruction::ICmp: |
5291 | case Instruction::Select: |
5292 | if (Kind != RK_IntegerMinMax && |
5293 | (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) |
5294 | return ReductionInstDesc(false, I); |
5295 | return isMinMaxSelectCmpPattern(I, Prev); |
5296 | } |
5297 | } |
5298 | |
5299 | LoopVectorizationLegality::InductionKind |
5300 | LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { |
5301 | Type *PhiTy = Phi->getType(); |
5302 | // We only handle integer and pointer induction variables.
5303 | if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) |
5304 | return IK_NoInduction; |
5305 | |
5306 | // Check that the PHI is consecutive. |
5307 | const SCEV *PhiScev = SE->getSCEV(Phi); |
5308 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); |
5309 | if (!AR) { |
5310 | DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: PHI is not a poly recurrence.\n" ; } } while (0); |
5311 | return IK_NoInduction; |
5312 | } |
5313 | const SCEV *Step = AR->getStepRecurrence(*SE); |
5314 | |
5315 | // Integer inductions need to have a stride of one. |
5316 | if (PhiTy->isIntegerTy()) { |
5317 | if (Step->isOne()) |
5318 | return IK_IntInduction; |
5319 | if (Step->isAllOnesValue()) |
5320 | return IK_ReverseIntInduction; |
5321 | return IK_NoInduction; |
5322 | } |
5323 | |
5324 | // Calculate the pointer stride and check if it is consecutive. |
5325 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5326 | if (!C) |
5327 | return IK_NoInduction; |
5328 | |
5329 | assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
5330 | Type *PointerElementType = PhiTy->getPointerElementType(); |
5331 | // The pointer stride cannot be determined if the pointer element type is not |
5332 | // sized. |
5333 | if (!PointerElementType->isSized()) |
5334 | return IK_NoInduction; |
5335 | |
5336 | uint64_t Size = DL->getTypeAllocSize(PointerElementType); |
5337 | if (C->getValue()->equalsInt(Size)) |
5338 | return IK_PtrInduction; |
5339 | else if (C->getValue()->equalsInt(0 - Size)) |
5340 | return IK_ReversePtrInduction; |
5341 | |
5342 | return IK_NoInduction; |
5343 | } |
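     |
     | // Illustrative sketch (not part of the original source) mirroring the
     | // classification above, assuming the step recurrence is a known constant.
     | static const char *exampleClassifyInduction(int64_t Step, int64_t ElemSize,
     |                                             bool IsPointer) {
     |   if (!IsPointer) { // integer inductions need a stride of exactly +/-1
     |     if (Step == 1)  return "IK_IntInduction";
     |     if (Step == -1) return "IK_ReverseIntInduction";
     |     return "IK_NoInduction";
     |   }
     |   if (Step == ElemSize)  return "IK_PtrInduction";        // e.g. ++p
     |   if (Step == -ElemSize) return "IK_ReversePtrInduction"; // e.g. --p
     |   return "IK_NoInduction";
     | }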
5344 | |
5345 | bool LoopVectorizationLegality::isInductionVariable(const Value *V) { |
5346 | Value *In0 = const_cast<Value*>(V); |
5347 | PHINode *PN = dyn_cast_or_null<PHINode>(In0); |
5348 | if (!PN) |
5349 | return false; |
5350 | |
5351 | return Inductions.count(PN); |
5352 | } |
5353 | |
5354 | bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { |
5355 | assert(TheLoop->contains(BB) && "Unknown block used")((TheLoop->contains(BB) && "Unknown block used") ? static_cast<void> (0) : __assert_fail ("TheLoop->contains(BB) && \"Unknown block used\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5355, __PRETTY_FUNCTION__)); |
5356 | |
5357 | // Blocks that do not dominate the latch need predication. |
5358 | BasicBlock* Latch = TheLoop->getLoopLatch(); |
5359 | return !DT->dominates(BB, Latch); |
5360 | } |
5361 | |
5362 | bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, |
5363 | SmallPtrSetImpl<Value *> &SafePtrs) { |
5364 | |
5365 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5366 | // Check that we don't have a constant expression that can trap as operand. |
5367 | for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); |
5368 | OI != OE; ++OI) { |
5369 | if (Constant *C = dyn_cast<Constant>(*OI)) |
5370 | if (C->canTrap()) |
5371 | return false; |
5372 | } |
5373 | // We might be able to hoist the load. |
5374 | if (it->mayReadFromMemory()) { |
5375 | LoadInst *LI = dyn_cast<LoadInst>(it); |
5376 | if (!LI) |
5377 | return false; |
5378 | if (!SafePtrs.count(LI->getPointerOperand())) { |
5379 | if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { |
5380 | MaskedOp.insert(LI); |
5381 | continue; |
5382 | } |
5383 | return false; |
5384 | } |
5385 | } |
5386 | |
5387 | // We don't predicate stores at the moment. |
5388 | if (it->mayWriteToMemory()) { |
5389 | StoreInst *SI = dyn_cast<StoreInst>(it); |
5390 | // We only support predication of stores in basic blocks with one |
5391 | // predecessor. |
5392 | if (!SI) |
5393 | return false; |
5394 | |
5395 | bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); |
5396 | bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); |
5397 | |
5398 | if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || |
5399 | !isSinglePredecessor) { |
5400 | // Build a masked store if it is legal for the target, otherwise scalarize |
5401 | // the block. |
5402 | bool isLegalMaskedOp = |
5403 | isLegalMaskedStore(SI->getValueOperand()->getType(), |
5404 | SI->getPointerOperand()); |
5405 | if (isLegalMaskedOp) { |
5406 | --NumPredStores; |
5407 | MaskedOp.insert(SI); |
5408 | continue; |
5409 | } |
5410 | return false; |
5411 | } |
5412 | } |
5413 | if (it->mayThrow()) |
5414 | return false; |
5415 | |
5416 | // The instructions below can trap. |
5417 | switch (it->getOpcode()) { |
5418 | default: continue; |
5419 | case Instruction::UDiv: |
5420 | case Instruction::SDiv: |
5421 | case Instruction::URem: |
5422 | case Instruction::SRem: |
5423 | return false; |
5424 | } |
5425 | } |
5426 | |
5427 | return true; |
5428 | } |
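     |
     | // Illustrative sketch (not part of the original source): the conditional
     | // store shape the predication logic above reasons about. The names are
     | // hypothetical.
     | static void examplePredicatedStore(int *a, const int *c, int n) {
     |   for (int i = 0; i < n; ++i)
     |     if (c[i])   // this block doesn't dominate the latch: needs predication
     |       a[i] = 0; // becomes a masked store if legal for the target
     | }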
5429 | |
5430 | LoopVectorizationCostModel::VectorizationFactor |
5431 | LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { |
5432 | // Width 1 means no vectorization.
5433 | VectorizationFactor Factor = { 1U, 0U }; |
5434 | if (OptForSize && Legal->getRuntimePointerCheck()->Need) { |
5435 | emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); |
5436 | DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n" ; } } while (0); |
5437 | return Factor; |
5438 | } |
5439 | |
5440 | if (!EnableCondStoresVectorization && Legal->NumPredStores) { |
5441 | emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); |
5442 | DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n" ; } } while (0); |
5443 | return Factor; |
5444 | } |
5445 | |
5446 | // Find the trip count. |
5447 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5448 | DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found trip count: " << TC << '\n'; } } while (0); |
5449 | |
5450 | unsigned WidestType = getWidestType(); |
5451 | unsigned WidestRegister = TTI.getRegisterBitWidth(true); |
5452 | unsigned MaxSafeDepDist = -1U; |
5453 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5454 | MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; |
5455 | WidestRegister = ((WidestRegister < MaxSafeDepDist) ? |
5456 | WidestRegister : MaxSafeDepDist); |
5457 | unsigned MaxVectorSize = WidestRegister / WidestType; |
5458 | DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"; } } while (0); |
5459 | DEBUG(dbgs() << "LV: The Widest register is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0) |
5460 | << WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0); |
5461 | |
5462 | if (MaxVectorSize == 0) { |
5463 | DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n" ; } } while (0); |
5464 | MaxVectorSize = 1; |
5465 | } |
5466 | |
5467 | assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
5468 | " into one vector!");
5469 | |
5470 | unsigned VF = MaxVectorSize; |
5471 | |
5472 | // If we optimize the program for size, avoid creating the tail loop. |
5473 | if (OptForSize) { |
5474 | // If we are unable to calculate the trip count then don't try to vectorize. |
5475 | if (TC < 2) { |
5476 | emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); |
5477 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5478 | return Factor; |
5479 | } |
5480 | |
5481 | // Find the maximum SIMD width that can fit within the trip count. |
5482 | VF = TC % MaxVectorSize; |
5483 | |
5484 | if (VF == 0) |
5485 | VF = MaxVectorSize; |
5486 | |
5487 | // If the trip count that we found modulo the vectorization factor is not |
5488 | // zero then we require a tail. |
5489 | if (VF < 2) { |
5490 | emitAnalysis(Report() << "cannot optimize for size and vectorize at the " |
5491 | "same time. Enable vectorization of this loop " |
5492 | "with '#pragma clang loop vectorize(enable)' " |
5493 | "when compiling with -Os"); |
5494 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5495 | return Factor; |
5496 | } |
5497 | } |
5498 | |
5499 | int UserVF = Hints->getWidth(); |
5500 | if (UserVF != 0) { |
5501 | assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
5502 | DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
5503 | |
5504 | Factor.Width = UserVF; |
5505 | return Factor; |
5506 | } |
5507 | |
5508 | float Cost = expectedCost(1); |
5509 | #ifndef NDEBUG |
5510 | const float ScalarCost = Cost; |
5511 | #endif /* NDEBUG */ |
5512 | unsigned Width = 1; |
5513 | DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5514 | |
5515 | bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; |
5516 | // Ignore scalar width, because the user explicitly wants vectorization. |
5517 | if (ForceVectorization && VF > 1) { |
5518 | Width = 2; |
5519 | Cost = expectedCost(Width) / (float)Width; |
5520 | } |
5521 | |
5522 | for (unsigned i=2; i <= VF; i*=2) { |
5523 | // Notice that the vector loop needs to be executed fewer times, so
5524 | // we need to divide the cost of the vector loop by the width of
5525 | // the vector elements.
5526 | float VectorCost = expectedCost(i) / (float)i; |
5527 | DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
5528 | (int)VectorCost << ".\n");
5529 | if (VectorCost < Cost) { |
5530 | Cost = VectorCost; |
5531 | Width = i; |
5532 | } |
5533 | } |
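     | // For illustration (hypothetical costs): if the scalar loop costs 8 and
     | // expectedCost(4) returns 20, the per-lane cost is 20 / 4 = 5 < 8, so
     | // Width becomes 4; widths whose per-lane cost is not below the current
     | // best leave the selection unchanged.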
5534 | |
5535 | DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5536 | << "LV: Vectorization seems to be not beneficial, "
5537 | << "but was forced by a user.\n");
5538 | DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5539 | Factor.Width = Width; |
5540 | Factor.Cost = Width * Cost; |
5541 | return Factor; |
5542 | } |
5543 | |
5544 | unsigned LoopVectorizationCostModel::getWidestType() { |
5545 | unsigned MaxWidth = 8; |
5546 | |
5547 | // For each block. |
5548 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5549 | be = TheLoop->block_end(); bb != be; ++bb) { |
5550 | BasicBlock *BB = *bb; |
5551 | |
5552 | // For each instruction in the loop. |
5553 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5554 | Type *T = it->getType(); |
5555 | |
5556 | // Ignore ephemeral values. |
5557 | if (EphValues.count(it)) |
5558 | continue; |
5559 | |
5560 | // Only examine Loads, Stores and PHINodes. |
5561 | if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) |
5562 | continue; |
5563 | |
5564 | // Examine PHI nodes that are reduction variables. |
5565 | if (PHINode *PN = dyn_cast<PHINode>(it)) |
5566 | if (!Legal->getReductionVars()->count(PN)) |
5567 | continue; |
5568 | |
5569 | // Examine the stored values. |
5570 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) |
5571 | T = ST->getValueOperand()->getType(); |
5572 | |
5573 | // Ignore loaded pointer types and stored pointer types that are not |
5574 | // consecutive. However, we do want to take consecutive stores/loads of |
5575 | // pointer vectors into account. |
5576 | if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) |
5577 | continue; |
5578 | |
5579 | MaxWidth = std::max(MaxWidth, |
5580 | (unsigned)DL->getTypeSizeInBits(T->getScalarType())); |
5581 | } |
5582 | } |
5583 | |
5584 | return MaxWidth; |
5585 | } |
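     | // For illustration: a loop that loads i8 elements but stores i32 results
     | // reports a widest type of 32 bits; with a hypothetical 128-bit widest
     | // register this would bound the vectorization factor at 128 / 32 = 4
     | // lanes.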
5586 | |
5587 | unsigned |
5588 | LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, |
5589 | unsigned VF, |
5590 | unsigned LoopCost) { |
5591 | |
5592 | // -- The unroll heuristics -- |
5593 | // We unroll the loop in order to expose ILP and reduce the loop overhead. |
5594 | // There are many micro-architectural considerations that we can't predict |
5595 | // at this level. For example, frontend pressure (on decode or fetch) due to |
5596 | // code size, or the number and capabilities of the execution ports. |
5597 | // |
5598 | // We use the following heuristics to select the unroll factor: |
5599 | // 1. If the code has reductions, then we unroll in order to break the cross |
5600 | // iteration dependency. |
5601 | // 2. If the loop is really small, then we unroll in order to reduce the loop |
5602 | // overhead. |
5603 | // 3. We don't unroll if we think that we will spill registers to memory due |
5604 | // to the increased register pressure. |
5605 | |
5606 | // Use the user preference, unless 'auto' is selected. |
5607 | int UserUF = Hints->getInterleave(); |
5608 | if (UserUF != 0) |
5609 | return UserUF; |
5610 | |
5611 | // When we optimize for size, we don't unroll. |
5612 | if (OptForSize) |
5613 | return 1; |
5614 | |
5615 | // If a safe dependence distance constrained the vectorization factor, do not unroll.
5616 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5617 | return 1; |
5618 | |
5619 | // Do not unroll loops with a relatively small trip count. |
5620 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5621 | if (TC > 1 && TC < TinyTripCountUnrollThreshold) |
5622 | return 1; |
5623 | |
5624 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); |
5625 | DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
5626 | " registers\n");
5627 | |
5628 | if (VF == 1) { |
5629 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
5630 | TargetNumRegisters = ForceTargetNumScalarRegs; |
5631 | } else { |
5632 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5633 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5634 | } |
5635 | |
5636 | LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); |
5637 | // We divide by these values below, so assume that we have at least one
5638 | // instruction that uses at least one register.
5639 | R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); |
5640 | R.NumInstructions = std::max(R.NumInstructions, 1U); |
5641 | |
5642 | // We calculate the unroll factor using the following formula. |
5643 | // Subtract the number of loop invariants from the number of available |
5644 | // registers. These registers are used by all of the unrolled instances. |
5645 | // Next, divide the remaining registers by the number of registers that is |
5646 | // required by the loop, in order to estimate how many parallel instances |
5647 | // fit without causing spills. All of this is rounded down if necessary to be |
5648 | // a power of two. We want power of two unroll factors to simplify any |
5649 | // addressing operations or alignment considerations. |
5650 | unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / |
5651 | R.MaxLocalUsers); |
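     | // For illustration (hypothetical numbers): with 16 target registers,
     | // 2 loop-invariant values, and a peak of 3 simultaneously live in-loop
     | // values, UF = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.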
5652 | |
5653 | // Don't count the induction variable as unrolled. |
5654 | if (EnableIndVarRegisterHeur) |
5655 | UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / |
5656 | std::max(1U, (R.MaxLocalUsers - 1))); |
5657 | |
5658 | // Clamp the unroll factor ranges to reasonable factors. |
5659 | unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); |
5660 | |
5661 | // Check if the user has overridden the unroll max. |
5662 | if (VF == 1) { |
5663 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5664 | MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; |
5665 | } else { |
5666 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5667 | MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; |
5668 | } |
5669 | |
5670 | // If we did not calculate the cost for VF (because the user selected the VF) |
5671 | // then we calculate the cost of VF here. |
5672 | if (LoopCost == 0) |
5673 | LoopCost = expectedCost(VF); |
5674 | |
5675 | // Clamp the calculated UF to be between 1 and the max unroll factor
5676 | // that the target allows. |
5677 | if (UF > MaxInterleaveSize) |
5678 | UF = MaxInterleaveSize; |
5679 | else if (UF < 1) |
5680 | UF = 1; |
5681 | |
5682 | // Unroll if we vectorized this loop and there is a reduction that could |
5683 | // benefit from unrolling. |
5684 | if (VF > 1 && Legal->getReductionVars()->size()) { |
5685 | DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
5686 | return UF; |
5687 | } |
5688 | |
5689 | // Note that if we've already vectorized the loop we will have done the |
5690 | // runtime check and so unrolling won't require further checks. |
5691 | bool UnrollingRequiresRuntimePointerCheck = |
5692 | (VF == 1 && Legal->getRuntimePointerCheck()->Need); |
5693 | |
5694 | // We want to unroll small loops in order to reduce the loop overhead and |
5695 | // potentially expose ILP opportunities. |
5696 | DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5697 | if (!UnrollingRequiresRuntimePointerCheck && |
5698 | LoopCost < SmallLoopCost) { |
5699 | // We assume that the cost overhead is 1 and we use the cost model |
5700 | // to estimate the cost of the loop and unroll until the cost of the |
5701 | // loop overhead is about 5% of the cost of the loop. |
5702 | unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); |
5703 | |
5704 | // Unroll until store/load ports (estimated by max unroll factor) are |
5705 | // saturated. |
5706 | unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); |
5707 | unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); |
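     | // For illustration (hypothetical numbers): with UF = 8, 2 stores, and
     | // 4 loads, StoresUF = 8 / 2 = 4 and LoadsUF = 8 / 4 = 2, so the
     | // port-saturation check below would propose a factor of 4.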
5708 | |
5709 | // If we have a scalar reduction (vector reductions are already dealt with
5710 | // by this point), we can increase the critical path length if the loop
5711 | // we're unrolling is inside another loop. Limit the factor, by default, to
5712 | // 2, so the critical path only gets increased by one reduction operation.
5713 | if (Legal->getReductionVars()->size() && |
5714 | TheLoop->getLoopDepth() > 1) { |
5715 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF); |
5716 | SmallUF = std::min(SmallUF, F); |
5717 | StoresUF = std::min(StoresUF, F); |
5718 | LoadsUF = std::min(LoadsUF, F); |
5719 | } |
5720 | |
5721 | if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { |
5722 | DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
5723 | return std::max(StoresUF, LoadsUF); |
5724 | } |
5725 | |
5726 | DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
5727 | return SmallUF; |
5728 | } |
5729 | |
5730 | DEBUG(dbgs() << "LV: Not Unrolling.\n");
5731 | return 1; |
5732 | } |
5733 | |
5734 | LoopVectorizationCostModel::RegisterUsage |
5735 | LoopVectorizationCostModel::calculateRegisterUsage() { |
5736 | // This function calculates the register usage by measuring the highest number |
5737 | // of values that are alive at a single location. Obviously, this is a very |
5738 | // rough estimation. We scan the loop in topological order and assign a
5739 | // number to each instruction. We use RPO to ensure that defs are
5740 | // met before their users. We assume that each instruction that has in-loop |
5741 | // users starts an interval. We record every time that an in-loop value is |
5742 | // used, so we have a list of the first and last occurrences of each |
5743 | // instruction. Next, we transpose this data structure into a multi map that |
5744 | // holds the list of intervals that *end* at a specific location. This multi |
5745 | // map allows us to perform a linear search. We scan the instructions linearly |
5746 | // and record each time that a new interval starts, by placing it in a set. |
5747 | // If we find this value in the multi-map then we remove it from the set. |
5748 | // The max register usage is the maximum size of the set. |
5749 | // We also search for instructions that are defined outside the loop, but are |
5750 | // used inside the loop. We need this number separately from the max-interval |
5751 | // usage number because when we unroll, loop-invariant values do not take |
5752 | // more registers.
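     | // For illustration (hypothetical fragment): if %a and %b are loaded
     | // early in the body and only consumed by a later add, both intervals
     | // stay open across the instructions in between, so MaxLocalUsers is at
     | // least 2; a value defined before the loop but used inside it is
     | // counted in LoopInvariantRegs instead.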
5753 | LoopBlocksDFS DFS(TheLoop); |
5754 | DFS.perform(LI); |
5755 | |
5756 | RegisterUsage R; |
5757 | R.NumInstructions = 0; |
5758 | |
5759 | // Each 'key' in the map opens a new interval. The values |
5760 | // of the map are the index of the 'last seen' usage of the |
5761 | // instruction that is the key. |
5762 | typedef DenseMap<Instruction*, unsigned> IntervalMap; |
5763 | // Maps an index to its instruction.
5764 | DenseMap<unsigned, Instruction*> IdxToInstr; |
5765 | // Marks the end of each interval. |
5766 | IntervalMap EndPoint; |
5767 | // Saves the set of instructions that have in-loop users.
5768 | SmallSet<Instruction*, 8> Ends; |
5769 | // Saves the list of values that are used in the loop but are |
5770 | // defined outside the loop, such as arguments and constants. |
5771 | SmallPtrSet<Value*, 8> LoopInvariants; |
5772 | |
5773 | unsigned Index = 0; |
5774 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
5775 | be = DFS.endRPO(); bb != be; ++bb) { |
5776 | R.NumInstructions += (*bb)->size(); |
5777 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
5778 | ++it) { |
5779 | Instruction *I = it; |
5780 | IdxToInstr[Index++] = I; |
5781 | |
5782 | // Save the end location of each USE. |
5783 | for (unsigned i = 0; i < I->getNumOperands(); ++i) { |
5784 | Value *U = I->getOperand(i); |
5785 | Instruction *Instr = dyn_cast<Instruction>(U); |
5786 | |
5787 | // Ignore non-instruction values such as arguments, constants, etc. |
5788 | if (!Instr) continue; |
5789 | |
5790 | // If this instruction is outside the loop then record it and continue. |
5791 | if (!TheLoop->contains(Instr)) { |
5792 | LoopInvariants.insert(Instr); |
5793 | continue; |
5794 | } |
5795 | |
5796 | // Overwrite previous end points. |
5797 | EndPoint[Instr] = Index; |
5798 | Ends.insert(Instr); |
5799 | } |
5800 | } |
5801 | } |
5802 | |
5803 | // Saves the list of intervals that end with the index in 'key'. |
5804 | typedef SmallVector<Instruction*, 2> InstrList; |
5805 | DenseMap<unsigned, InstrList> TransposeEnds; |
5806 | |
5807 | // Transpose the EndPoints to a list of values that end at each index. |
5808 | for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); |
5809 | it != e; ++it) |
5810 | TransposeEnds[it->second].push_back(it->first); |
5811 | |
5812 | SmallSet<Instruction*, 8> OpenIntervals; |
5813 | unsigned MaxUsage = 0; |
5814 | |
5815 | |
5816 | DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5817 | for (unsigned int i = 0; i < Index; ++i) { |
5818 | Instruction *I = IdxToInstr[i]; |
5819 | // Ignore instructions that are never used within the loop. |
5820 | if (!Ends.count(I)) continue; |
5821 | |
5822 | // Ignore ephemeral values. |
5823 | if (EphValues.count(I)) |
5824 | continue; |
5825 | |
5826 | // Remove all of the instructions that end at this location. |
5827 | InstrList &List = TransposeEnds[i]; |
5828 | for (unsigned int j=0, e = List.size(); j < e; ++j) |
5829 | OpenIntervals.erase(List[j]); |
5830 | |
5831 | // Count the number of live intervals.
5832 | MaxUsage = std::max(MaxUsage, OpenIntervals.size()); |
5833 | |
5834 | DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
5835 | OpenIntervals.size() << '\n');
5836 | |
5837 | // Add the current instruction to the list of open intervals. |
5838 | OpenIntervals.insert(I); |
5839 | } |
5840 | |
5841 | unsigned Invariant = LoopInvariants.size(); |
5842 | DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
5843 | DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
5844 | DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
5845 | |
5846 | R.LoopInvariantRegs = Invariant; |
5847 | R.MaxLocalUsers = MaxUsage; |
5848 | return R; |
5849 | } |
5850 | |
5851 | unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { |
5852 | unsigned Cost = 0; |
5853 | |
5854 | // For each block. |
5855 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5856 | be = TheLoop->block_end(); bb != be; ++bb) { |
5857 | unsigned BlockCost = 0; |
5858 | BasicBlock *BB = *bb; |
5859 | |
5860 | // For each instruction in the old loop. |
5861 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5862 | // Skip dbg intrinsics. |
5863 | if (isa<DbgInfoIntrinsic>(it)) |
5864 | continue; |
5865 | |
5866 | // Ignore ephemeral values. |
5867 | if (EphValues.count(it)) |
5868 | continue; |
5869 | |
5870 | unsigned C = getInstructionCost(it, VF); |
5871 | |
5872 | // Check if we should override the cost. |
5873 | if (ForceTargetInstructionCost.getNumOccurrences() > 0) |
5874 | C = ForceTargetInstructionCost; |
5875 | |
5876 | BlockCost += C; |
5877 | DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
5878 | VF << " For instruction: " << *it << '\n');
5879 | } |
5880 | |
5881 | // We assume that if-converted blocks have a 50% chance of being executed.
5882 | // When the code is scalar, some of the blocks are avoided due to control
5883 | // flow. When the code is vectorized, we execute all code paths.
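     | // For illustration: a predicated block whose instructions sum to a cost
     | // of 10 contributes 10 / 2 = 5 to the scalar (VF == 1) estimate, while
     | // the vectorized estimate charges the full 10.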
5884 | if (VF == 1 && Legal->blockNeedsPredication(*bb)) |
5885 | BlockCost /= 2; |
5886 | |
5887 | Cost += BlockCost; |
5888 | } |
5889 | |
5890 | return Cost; |
5891 | } |
5892 | |
5893 | /// \brief Check whether the address computation for a non-consecutive memory |
5894 | /// access looks like an unlikely candidate for being merged into the indexing |
5895 | /// mode. |
5896 | /// |
5897 | /// We look for a GEP which has one index that is an induction variable and all |
5898 | /// other indices are loop invariant. If the stride of this access is also |
5899 | /// within a small bound we decide that this address computation can likely be |
5900 | /// merged into the addressing mode. |
5901 | /// In all other cases, we identify the address computation as complex. |
5902 | static bool isLikelyComplexAddressComputation(Value *Ptr, |
5903 | LoopVectorizationLegality *Legal, |
5904 | ScalarEvolution *SE, |
5905 | const Loop *TheLoop) { |
5906 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
5907 | if (!Gep) |
5908 | return true; |
5909 | |
5910 | // We are looking for a gep with all loop invariant indices except for one |
5911 | // which should be an induction variable. |
5912 | unsigned NumOperands = Gep->getNumOperands(); |
5913 | for (unsigned i = 1; i < NumOperands; ++i) { |
5914 | Value *Opd = Gep->getOperand(i); |
5915 | if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && |
5916 | !Legal->isInductionVariable(Opd)) |
5917 | return true; |
5918 | } |
5919 | |
5920 | // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step |
5921 | // can likely be merged into the address computation. |
5922 | unsigned MaxMergeDistance = 64; |
5923 | |
5924 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); |
5925 | if (!AddRec) |
5926 | return true; |
5927 | |
5928 | // Check the step is constant. |
5929 | const SCEV *Step = AddRec->getStepRecurrence(*SE); |
5930 | // Calculate the pointer stride and check if it is consecutive. |
5931 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5932 | if (!C) |
5933 | return true; |
5934 | |
5935 | const APInt &APStepVal = C->getValue()->getValue(); |
5936 | |
5937 | // Huge step value - give up. |
5938 | if (APStepVal.getBitWidth() > 64) |
5939 | return true; |
5940 | |
5941 | int64_t StepVal = APStepVal.getSExtValue(); |
5942 | |
5943 | return StepVal > MaxMergeDistance; |
5944 | } |
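     | // For illustration (hypothetical IR): an access such as
     | //   %gep = getelementptr i32, i32* %A, i64 %iv
     | // advances by a small constant step (4 bytes) and is likely folded into
     | // the addressing mode, whereas a non-affine pointer or a step larger
     | // than MaxMergeDistance (64) is reported as a complex computation.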
5945 | |
5946 | static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { |
5947 | if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) |
5948 | return true; |
5949 | return false; |
5950 | } |
5951 | |
5952 | unsigned |
5953 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { |
5954 | // If we know that this instruction will remain uniform, check the cost of |
5955 | // the scalar version. |
5956 | if (Legal->isUniformAfterVectorization(I)) |
5957 | VF = 1; |
5958 | |
5959 | Type *RetTy = I->getType(); |
5960 | Type *VectorTy = ToVectorTy(RetTy, VF); |
5961 | |
5962 | // TODO: We need to estimate the cost of intrinsic calls. |
5963 | switch (I->getOpcode()) { |
5964 | case Instruction::GetElementPtr: |
5965 | // We mark this instruction as zero-cost because the cost of GEPs in |
5966 | // vectorized code depends on whether the corresponding memory instruction |
5967 | // is scalarized or not. Therefore, we handle GEPs with the memory |
5968 | // instruction cost. |
5969 | return 0; |
5970 | case Instruction::Br: { |
5971 | return TTI.getCFInstrCost(I->getOpcode()); |
5972 | } |
5973 | case Instruction::PHI: |
5974 | // TODO: IF-converted IFs become selects.
5975 | return 0; |
5976 | case Instruction::Add: |
5977 | case Instruction::FAdd: |
5978 | case Instruction::Sub: |
5979 | case Instruction::FSub: |
5980 | case Instruction::Mul: |
5981 | case Instruction::FMul: |
5982 | case Instruction::UDiv: |
5983 | case Instruction::SDiv: |
5984 | case Instruction::FDiv: |
5985 | case Instruction::URem: |
5986 | case Instruction::SRem: |
5987 | case Instruction::FRem: |
5988 | case Instruction::Shl: |
5989 | case Instruction::LShr: |
5990 | case Instruction::AShr: |
5991 | case Instruction::And: |
5992 | case Instruction::Or: |
5993 | case Instruction::Xor: { |
5994 | // Since we will replace the stride by 1, the multiplication should go away.
5995 | if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) |
5996 | return 0; |
5997 | // Certain instructions can be cheaper to vectorize if they have a constant |
5998 | // second vector operand. One example of this are shifts on x86. |
5999 | TargetTransformInfo::OperandValueKind Op1VK = |
6000 | TargetTransformInfo::OK_AnyValue; |
6001 | TargetTransformInfo::OperandValueKind Op2VK = |
6002 | TargetTransformInfo::OK_AnyValue; |
6003 | TargetTransformInfo::OperandValueProperties Op1VP = |
6004 | TargetTransformInfo::OP_None; |
6005 | TargetTransformInfo::OperandValueProperties Op2VP = |
6006 | TargetTransformInfo::OP_None; |
6007 | Value *Op2 = I->getOperand(1); |
6008 | |
6009 | // Check for a splat of a constant or for a non-uniform vector of constants.
6010 | if (isa<ConstantInt>(Op2)) { |
6011 | ConstantInt *CInt = cast<ConstantInt>(Op2); |
6012 | if (CInt && CInt->getValue().isPowerOf2()) |
6013 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
6014 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
6015 | } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { |
6016 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; |
6017 | Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); |
6018 | if (SplatValue) { |
6019 | ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); |
6020 | if (CInt && CInt->getValue().isPowerOf2()) |
6021 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
6022 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
6023 | } |
6024 | } |
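     | // For illustration: a shift whose second operand is a splat of the
     | // constant 4 (a power of two) is classified as OK_UniformConstantValue
     | // with OP_PowerOf2, allowing targets such as x86 to report a cheaper
     | // cost than for a variable shift amount.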
6025 | |
6026 | return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, |
6027 | Op1VP, Op2VP); |
6028 | } |
6029 | case Instruction::Select: { |
6030 | SelectInst *SI = cast<SelectInst>(I); |
6031 | const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); |
6032 | bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); |
6033 | Type *CondTy = SI->getCondition()->getType(); |
6034 | if (!ScalarCond) |
6035 | CondTy = VectorType::get(CondTy, VF); |
6036 | |
6037 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); |
6038 | } |
6039 | case Instruction::ICmp: |
6040 | case Instruction::FCmp: { |
6041 | Type *ValTy = I->getOperand(0)->getType(); |
6042 | VectorTy = ToVectorTy(ValTy, VF); |
6043 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); |
6044 | } |
6045 | case Instruction::Store: |
6046 | case Instruction::Load: { |
6047 | StoreInst *SI = dyn_cast<StoreInst>(I); |
6048 | LoadInst *LI = dyn_cast<LoadInst>(I); |
6049 | Type *ValTy = (SI ? SI->getValueOperand()->getType() : |
6050 | LI->getType()); |
6051 | VectorTy = ToVectorTy(ValTy, VF); |
6052 | |
6053 | unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); |
6054 | unsigned AS = SI ? SI->getPointerAddressSpace() : |
6055 | LI->getPointerAddressSpace(); |
6056 | Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); |
6057 | // We add the cost of address computation here instead of with the gep
6058 | // instruction because only here do we know whether the operation is
6059 | // scalarized.
6060 | if (VF == 1) |
6061 | return TTI.getAddressComputationCost(VectorTy) + |
6062 | TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
6063 | |
6064 | // Scalarized loads/stores. |
6065 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
6066 | bool Reverse = ConsecutiveStride < 0; |
6067 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); |
6068 | unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; |
6069 | if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { |
6070 | bool IsComplexComputation = |
6071 | isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); |
6072 | unsigned Cost = 0; |
6073 | // The cost of extracting from the value vector and pointer vector. |
6074 | Type *PtrTy = ToVectorTy(Ptr->getType(), VF); |
6075 | for (unsigned i = 0; i < VF; ++i) { |
6076 | // The cost of extracting the pointer operand. |
6077 | Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); |
6078 | // In case of STORE, the cost of ExtractElement from the vector. |
6079 | // In case of LOAD, the cost of InsertElement into the returned |
6080 | // vector. |
6081 | Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : |
6082 | Instruction::InsertElement, |
6083 | VectorTy, i); |
6084 | } |
6085 | |
6086 | // The cost of the scalar loads/stores. |
6087 | Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); |
6088 | Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), |
6089 | Alignment, AS); |
6090 | return Cost; |
6091 | } |
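     | // For illustration (hypothetical unit costs): scalarizing a load at
     | // VF = 4 with extract/insert costs of 1 pays 4 * (1 + 1) = 8 for moving
     | // elements, plus 4 scalar address computations and 4 scalar memory
     | // operations, which is why the wide (consecutive) path below is
     | // strongly preferred.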
6092 | |
6093 | // Wide load/stores. |
6094 | unsigned Cost = TTI.getAddressComputationCost(VectorTy); |
6095 | Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
6096 | |
6097 | if (Reverse) |
6098 | Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, |
6099 | VectorTy, 0); |
6100 | return Cost; |
6101 | } |
6102 | case Instruction::ZExt: |
6103 | case Instruction::SExt: |
6104 | case Instruction::FPToUI: |
6105 | case Instruction::FPToSI: |
6106 | case Instruction::FPExt: |
6107 | case Instruction::PtrToInt: |
6108 | case Instruction::IntToPtr: |
6109 | case Instruction::SIToFP: |
6110 | case Instruction::UIToFP: |
6111 | case Instruction::Trunc: |
6112 | case Instruction::FPTrunc: |
6113 | case Instruction::BitCast: { |
6114 | // We optimize the truncation of induction variables.
6115 | // The cost of these is the same as the scalar operation. |
6116 | if (I->getOpcode() == Instruction::Trunc && |
6117 | Legal->isInductionVariable(I->getOperand(0))) |
6118 | return TTI.getCastInstrCost(I->getOpcode(), I->getType(), |
6119 | I->getOperand(0)->getType()); |
6120 | |
6121 | Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); |
6122 | return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); |
6123 | } |
6124 | case Instruction::Call: { |
6125 | CallInst *CI = cast<CallInst>(I); |
6126 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
6127 | assert(ID && "Not an intrinsic call!");
6128 | Type *RetTy = ToVectorTy(CI->getType(), VF); |
6129 | SmallVector<Type*, 4> Tys; |
6130 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) |
6131 | Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); |
6132 | return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); |
6133 | } |
6134 | default: { |
6135 | // We are scalarizing the instruction. Return the cost of the scalar |
6136 | // instruction, plus the cost of insert and extract into vector |
6137 | // elements, times the vector width. |
6138 | unsigned Cost = 0; |
6139 | |
6140 | if (!RetTy->isVoidTy() && VF != 1) { |
6141 | unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, |
6142 | VectorTy); |
6143 | unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, |
6144 | VectorTy); |
6145 | |
6146 | // The cost of inserting the results plus extracting each one of the |
6147 | // operands. |
6148 | Cost += VF * (InsCost + ExtCost * I->getNumOperands()); |
6149 | } |
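     | // For illustration (hypothetical unit costs): an unknown two-operand
     | // instruction at VF = 4 with InsCost = ExtCost = 1 adds
     | // 4 * (1 + 1 * 2) = 12, in addition to VF times the cost assumed for
     | // the unknown operation itself (modeled as a 'mul').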
6150 | |
6151 | // The cost of executing VF copies of the scalar instruction. This opcode |
6152 | // is unknown. Assume that it is the same as 'mul'. |
6153 | Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); |
6154 | return Cost; |
6155 | } |
6156 | }// end of switch. |
6157 | } |
6158 | |
6159 | Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { |
6160 | if (Scalar->isVoidTy() || VF == 1) |
6161 | return Scalar; |
6162 | return VectorType::get(Scalar, VF); |
6163 | } |
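     | // For illustration: ToVectorTy(i32, 4) yields <4 x i32>, while
     | // ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the type unchanged.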
6164 | |
6165 | char LoopVectorize::ID = 0; |
6166 | static const char lv_name[] = "Loop Vectorization"; |
6167 | INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6168 | INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
6169 | INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
6170 | INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
6171 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
6172 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6173 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
6174 | INITIALIZE_PASS_DEPENDENCY(LCSSA)
6175 | INITIALIZE_PASS_DEPENDENCY(LoopInfo)
6176 | INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
6177 | INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6178 | |
6179 | namespace llvm { |
6180 | Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { |
6181 | return new LoopVectorize(NoUnrolling, AlwaysVectorize); |
6182 | } |
6183 | } |
6184 | |
6185 | bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { |
6186 | // Check for a store. |
6187 | if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) |
6188 | return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; |
6189 | |
6190 | // Check for a load. |
6191 | if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) |
6192 | return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; |
6193 | |
6194 | return false; |
6195 | } |
6196 | |
6197 | |
6198 | void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, |
6199 | bool IfPredicateStore) { |
6200 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
6201 | // Holds vector parameters or scalars, in case of uniform vals. |
6202 | SmallVector<VectorParts, 4> Params; |
6203 | |
6204 | setDebugLocFromInst(Builder, Instr); |
6205 | |
6206 | // Find all of the vectorized parameters. |
6207 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6208 | Value *SrcOp = Instr->getOperand(op); |
6209 | |
6210 | // If we are accessing the old induction variable, use the new one. |
6211 | if (SrcOp == OldInduction) { |
6212 | Params.push_back(getVectorValue(SrcOp)); |
6213 | continue; |
6214 | } |
6215 | |
6216 | // Try using previously calculated values. |
6217 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
6218 | |
6219 | // If the src is an instruction that appeared earlier in the basic block |
6220 | // then it should already be vectorized. |
6221 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
6222 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
6223 | // The parameter is a vector value from earlier. |
6224 | Params.push_back(WidenMap.get(SrcInst)); |
6225 | } else { |
6226 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
6227 | VectorParts Scalars; |
6228 | Scalars.append(UF, SrcOp); |
6229 | Params.push_back(Scalars); |
6230 | } |
6231 | } |
6232 | |
6233 | assert(Params.size() == Instr->getNumOperands() &&
6234 | "Invalid number of operands");
6235 | |
6236 | // Does this instruction return a value?
6237 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
6238 | |
6239 | Value *UndefVec = IsVoidRetTy ? nullptr : |
6240 | UndefValue::get(Instr->getType()); |
6241 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
6242 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
6243 | |
6244 | Instruction *InsertPt = Builder.GetInsertPoint(); |
6245 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
6246 | BasicBlock *CondBlock = nullptr; |
6247 | |
6248 | VectorParts Cond; |
6249 | Loop *VectorLp = nullptr; |
6250 | if (IfPredicateStore) { |
6251 | assert(Instr->getParent()->getSinglePredecessor() &&
6252 | "Only support single predecessor blocks");
6253 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
6254 | Instr->getParent()); |
6255 | VectorLp = LI->getLoopFor(IfBlock); |
6256 | assert(VectorLp && "Must have a loop for this block");
6257 | } |
6258 | |
6259 | // For each vector unroll 'part': |
6260 | for (unsigned Part = 0; Part < UF; ++Part) { |
6261 | // For each scalar that we create: |
6262 | |
6263 | // Start an "if (pred) a[i] = ..." block. |
6264 | Value *Cmp = nullptr; |
6265 | if (IfPredicateStore) { |
6266 | if (Cond[Part]->getType()->isVectorTy()) |
6267 | Cond[Part] = |
6268 | Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); |
6269 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], |
6270 | ConstantInt::get(Cond[Part]->getType(), 1)); |
6271 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
6272 | LoopVectorBody.push_back(CondBlock); |
6273 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
6274 | // Update Builder with newly created basic block. |
6275 | Builder.SetInsertPoint(InsertPt); |
6276 | } |
6277 | |
6278 | Instruction *Cloned = Instr->clone(); |
6279 | if (!IsVoidRetTy) |
6280 | Cloned->setName(Instr->getName() + ".cloned"); |
6281 | // Replace the operands of the cloned instructions with extracted scalars. |
6282 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6283 | Value *Op = Params[op][Part]; |
6284 | Cloned->setOperand(op, Op); |
6285 | } |
6286 | |
6287 | // Place the cloned scalar in the new loop. |
6288 | Builder.Insert(Cloned); |
6289 | |
6290 | // If the original scalar returns a value we need to place it in a vector |
6291 | // so that future users will be able to use it. |
6292 | if (!IsVoidRetTy) |
6293 | VecResults[Part] = Cloned; |
6294 | |
6295 | // End if-block. |
6296 | if (IfPredicateStore) { |
6297 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
6298 | LoopVectorBody.push_back(NewIfBlock); |
6299 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
6300 | Builder.SetInsertPoint(InsertPt); |
6301 | Instruction *OldBr = IfBlock->getTerminator(); |
6302 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
6303 | OldBr->eraseFromParent(); |
6304 | IfBlock = NewIfBlock; |
6305 | } |
6306 | } |
6307 | } |
6308 | |
6309 | void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { |
6310 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
6311 | bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); |
6312 | |
6313 | return scalarizeInstruction(Instr, IfPredicateStore); |
6314 | } |
6315 | |
6316 | Value *InnerLoopUnroller::reverseVector(Value *Vec) { |
6317 | return Vec; |
6318 | } |
6319 | |
6320 | Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { |
6321 | return V; |
6322 | } |
6323 | |
6324 | Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, |
6325 | bool Negate) { |
6326 | // When unrolling and the VF is 1, we only need to add a simple scalar. |
6327 | Type *ITy = Val->getType(); |
6328 | assert(!ITy->isVectorTy() && "Val must be a scalar");
6329 | Constant *C = ConstantInt::get(ITy, StartIdx, Negate); |
6330 | return Builder.CreateAdd(Val, C, "induction"); |
6331 | } |