File: | lib/Target/X86/X86ISelLowering.cpp |
Warning: | line 27192, column 5: Value stored to 'AllowIntDomain' is never read |
1 | //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This file defines the interfaces that X86 uses to lower LLVM code into a |
11 | // selection DAG. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "X86ISelLowering.h" |
16 | #include "Utils/X86ShuffleDecode.h" |
17 | #include "X86CallingConv.h" |
18 | #include "X86FrameLowering.h" |
19 | #include "X86InstrBuilder.h" |
20 | #include "X86IntrinsicsInfo.h" |
21 | #include "X86MachineFunctionInfo.h" |
22 | #include "X86ShuffleDecodeConstantPool.h" |
23 | #include "X86TargetMachine.h" |
24 | #include "X86TargetObjectFile.h" |
25 | #include "llvm/ADT/SmallBitVector.h" |
26 | #include "llvm/ADT/SmallSet.h" |
27 | #include "llvm/ADT/Statistic.h" |
28 | #include "llvm/ADT/StringExtras.h" |
29 | #include "llvm/ADT/StringSwitch.h" |
30 | #include "llvm/Analysis/EHPersonalities.h" |
31 | #include "llvm/CodeGen/IntrinsicLowering.h" |
32 | #include "llvm/CodeGen/MachineFrameInfo.h" |
33 | #include "llvm/CodeGen/MachineFunction.h" |
34 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
35 | #include "llvm/CodeGen/MachineJumpTableInfo.h" |
36 | #include "llvm/CodeGen/MachineModuleInfo.h" |
37 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
38 | #include "llvm/CodeGen/WinEHFuncInfo.h" |
39 | #include "llvm/IR/CallSite.h" |
40 | #include "llvm/IR/CallingConv.h" |
41 | #include "llvm/IR/Constants.h" |
42 | #include "llvm/IR/DerivedTypes.h" |
43 | #include "llvm/IR/DiagnosticInfo.h" |
44 | #include "llvm/IR/Function.h" |
45 | #include "llvm/IR/GlobalAlias.h" |
46 | #include "llvm/IR/GlobalVariable.h" |
47 | #include "llvm/IR/Instructions.h" |
48 | #include "llvm/IR/Intrinsics.h" |
49 | #include "llvm/MC/MCAsmInfo.h" |
50 | #include "llvm/MC/MCContext.h" |
51 | #include "llvm/MC/MCExpr.h" |
52 | #include "llvm/MC/MCSymbol.h" |
53 | #include "llvm/Support/CommandLine.h" |
54 | #include "llvm/Support/Debug.h" |
55 | #include "llvm/Support/ErrorHandling.h" |
56 | #include "llvm/Support/KnownBits.h" |
57 | #include "llvm/Support/MathExtras.h" |
58 | #include "llvm/Target/TargetLowering.h" |
59 | #include "llvm/Target/TargetOptions.h" |
60 | #include <algorithm> |
61 | #include <bitset> |
62 | #include <cctype> |
63 | #include <numeric> |
64 | using namespace llvm; |
65 | |
66 | #define DEBUG_TYPE "x86-isel" |
67 | |
68 | STATISTIC(NumTailCalls, "Number of tail calls"); |
69 | |
70 | static cl::opt<bool> ExperimentalVectorWideningLegalization( |
71 | "x86-experimental-vector-widening-legalization", cl::init(false), |
72 | cl::desc("Enable an experimental vector type legalization through widening " |
73 | "rather than promotion."), |
74 | cl::Hidden); |
75 | |
76 | static cl::opt<int> ExperimentalPrefLoopAlignment( |
77 | "x86-experimental-pref-loop-alignment", cl::init(4), |
78 | cl::desc("Sets the preferable loop alignment for experiments " |
79 | "(the last x86-experimental-pref-loop-alignment bits" |
80 | " of the loop header PC will be 0)."), |
81 | cl::Hidden); |
82 | |
83 | static cl::opt<bool> MulConstantOptimization( |
84 | "mul-constant-optimization", cl::init(true), |
85 | cl::desc("Replace 'mul x, Const' with more effective instructions like " |
86 | "SHIFT, LEA, etc."), |
87 | cl::Hidden); |
88 | |
89 | /// Call this when the user attempts to do something unsupported, like |
90 | /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike |
91 | /// report_fatal_error, so calling code should attempt to recover without |
92 | /// crashing. |
93 | static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, |
94 | const char *Msg) { |
95 | MachineFunction &MF = DAG.getMachineFunction(); |
96 | DAG.getContext()->diagnose( |
97 | DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); |
98 | } |
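// A hypothetical call-site sketch (assumed for illustration, not taken from
// this excerpt): diagnose the unsupported case and let lowering continue,
// e.g. when an f64 return is requested without SSE2:
//
//   if (!Subtarget.hasSSE2() && ValVT == MVT::f64)
//     errorUnsupported(DAG, dl, "returning f64 requires SSE2");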
99 | |
100 | X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, |
101 | const X86Subtarget &STI) |
102 | : TargetLowering(TM), Subtarget(STI) { |
103 | bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); |
104 | X86ScalarSSEf64 = Subtarget.hasSSE2(); |
105 | X86ScalarSSEf32 = Subtarget.hasSSE1(); |
106 | MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); |
107 | |
108 | // Set up the TargetLowering object. |
109 | |
110 | // X86 is weird. It always uses i8 for shift amounts and setcc results. |
111 | setBooleanContents(ZeroOrOneBooleanContent); |
112 | // X86-SSE is even stranger. It uses -1 or 0 for vector masks. |
113 | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
114 | |
115 | // For 64-bit, since we have so many registers, use the ILP scheduler. |
116 | // For 32-bit, use the register pressure specific scheduling. |
117 | // For Atom, always use ILP scheduling. |
118 | if (Subtarget.isAtom()) |
119 | setSchedulingPreference(Sched::ILP); |
120 | else if (Subtarget.is64Bit()) |
121 | setSchedulingPreference(Sched::ILP); |
122 | else |
123 | setSchedulingPreference(Sched::RegPressure); |
124 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
125 | setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); |
126 | |
127 | // Bypass expensive divides and use cheaper ones. |
128 | if (TM.getOptLevel() >= CodeGenOpt::Default) { |
129 | if (Subtarget.hasSlowDivide32()) |
130 | addBypassSlowDiv(32, 8); |
131 | if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) |
132 | addBypassSlowDiv(64, 32); |
133 | } |
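// A minimal standalone sketch (hypothetical helper, not part of this file)
// of what the divide bypass emits: a cheap width check that routes small
// operands to the short 8-bit divider instead of the slow 32-bit one.
static unsigned bypassSlowDiv32(unsigned A, unsigned B) {
  if (((A | B) & ~0xFFu) == 0)                  // both operands fit in 8 bits
    return (unsigned char)A / (unsigned char)B; // cheap DIV r/m8
  return A / B;                                 // full DIV r/m32
}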
134 | |
135 | if (Subtarget.isTargetKnownWindowsMSVC() || |
136 | Subtarget.isTargetWindowsItanium()) { |
137 | // Setup Windows compiler runtime calls. |
138 | setLibcallName(RTLIB::SDIV_I64, "_alldiv"); |
139 | setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); |
140 | setLibcallName(RTLIB::SREM_I64, "_allrem"); |
141 | setLibcallName(RTLIB::UREM_I64, "_aullrem"); |
142 | setLibcallName(RTLIB::MUL_I64, "_allmul"); |
143 | setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); |
144 | setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); |
145 | setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); |
146 | setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); |
147 | setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); |
148 | } |
149 | |
150 | if (Subtarget.isTargetDarwin()) { |
151 | // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. |
152 | setUseUnderscoreSetJmp(false); |
153 | setUseUnderscoreLongJmp(false); |
154 | } else if (Subtarget.isTargetWindowsGNU()) { |
155 | // MS runtime is weird: it exports _setjmp, but longjmp! |
156 | setUseUnderscoreSetJmp(true); |
157 | setUseUnderscoreLongJmp(false); |
158 | } else { |
159 | setUseUnderscoreSetJmp(true); |
160 | setUseUnderscoreLongJmp(true); |
161 | } |
162 | |
163 | // Set up the register classes. |
164 | addRegisterClass(MVT::i8, &X86::GR8RegClass); |
165 | addRegisterClass(MVT::i16, &X86::GR16RegClass); |
166 | addRegisterClass(MVT::i32, &X86::GR32RegClass); |
167 | if (Subtarget.is64Bit()) |
168 | addRegisterClass(MVT::i64, &X86::GR64RegClass); |
169 | |
170 | for (MVT VT : MVT::integer_valuetypes()) |
171 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
172 | |
173 | // We don't accept any truncstore of integer registers. |
174 | setTruncStoreAction(MVT::i64, MVT::i32, Expand); |
175 | setTruncStoreAction(MVT::i64, MVT::i16, Expand); |
176 | setTruncStoreAction(MVT::i64, MVT::i8 , Expand); |
177 | setTruncStoreAction(MVT::i32, MVT::i16, Expand); |
178 | setTruncStoreAction(MVT::i32, MVT::i8 , Expand); |
179 | setTruncStoreAction(MVT::i16, MVT::i8, Expand); |
180 | |
181 | setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
182 | |
183 | // SETOEQ and SETUNE require checking two conditions. |
184 | setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); |
185 | setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); |
186 | setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); |
187 | setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); |
188 | setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); |
189 | setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); |
190 | |
191 | // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this |
192 | // operation. |
193 | setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); |
194 | setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); |
195 | setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); |
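// A minimal scalar sketch of that promotion (hypothetical helper): a u16
// source is zero-extended to i32, which is always non-negative, so the
// signed i32 conversion yields the exact result.
static double u16ToDouble(unsigned short U) {
  int Wide = U;        // zero-extends; Wide >= 0
  return (double)Wide; // SINT_TO_FP at the wider type
}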
196 | |
197 | if (Subtarget.is64Bit()) { |
198 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) |
199 | // f32/f64 are legal, f80 is custom. |
200 | setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); |
201 | else |
202 | setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); |
203 | setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); |
204 | } else if (!Subtarget.useSoftFloat()) { |
205 | // We have an algorithm for SSE2->double, and we turn this into a |
206 | // 64-bit FILD followed by conditional FADD for other targets. |
207 | setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); |
208 | // We have an algorithm for SSE2, and we turn this into a 64-bit |
209 | // FILD or VCVTUSI2SS/SD for other targets. |
210 | setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); |
211 | } |
212 | |
213 | // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have |
214 | // this operation. |
215 | setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); |
216 | setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); |
217 | |
218 | if (!Subtarget.useSoftFloat()) { |
219 | // SSE has no i16 to fp conversion, only i32. |
220 | if (X86ScalarSSEf32) { |
221 | setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); |
222 | // f32 and f64 cases are Legal, f80 case is not |
223 | setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); |
224 | } else { |
225 | setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); |
226 | setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); |
227 | } |
228 | } else { |
229 | setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); |
230 | setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); |
231 | } |
232 | |
233 | // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have |
234 | // this operation. |
235 | setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); |
236 | setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); |
237 | |
238 | if (!Subtarget.useSoftFloat()) { |
239 | // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 |
240 | // are Legal, f80 is custom lowered. |
241 | setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); |
242 | setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); |
243 | |
244 | if (X86ScalarSSEf32) { |
245 | setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); |
246 | // f32 and f64 cases are Legal, f80 case is not |
247 | setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); |
248 | } else { |
249 | setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); |
250 | setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); |
251 | } |
252 | } else { |
253 | setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); |
254 | setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); |
255 | setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); |
256 | } |
257 | |
258 | // Handle FP_TO_UINT by promoting the destination to a larger signed |
259 | // conversion. |
260 | setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); |
261 | setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); |
262 | setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); |
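// The same trick in the other direction, as a hypothetical scalar sketch:
// the unsigned i16 result is computed with a wider signed conversion and
// then truncated (any value that fits in u16 also fits in i32).
static unsigned short doubleToU16(double D) {
  int Wide = (int)D;           // FP_TO_SINT at the promoted width
  return (unsigned short)Wide; // truncate back to the i16 result
}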
263 | |
264 | if (Subtarget.is64Bit()) { |
265 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { |
266 | // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. |
267 | setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); |
268 | setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); |
269 | } else { |
270 | setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); |
271 | setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); |
272 | } |
273 | } else if (!Subtarget.useSoftFloat()) { |
274 | // Since AVX is a superset of SSE3, only check for SSE here. |
275 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) |
276 | // Expand FP_TO_UINT into a select. |
277 | // FIXME: We would like to use a Custom expander here eventually to do |
278 | // the optimal thing for SSE vs. the default expansion in the legalizer. |
279 | setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); |
280 | else |
281 | // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. |
282 | // With SSE3 we can use fisttpll to convert to a signed i64; without |
283 | // SSE, we're stuck with a fistpll. |
284 | setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); |
285 | |
286 | setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); |
287 | } |
288 | |
289 | // TODO: when we have SSE, these could be more efficient, by using movd/movq. |
290 | if (!X86ScalarSSEf64) { |
291 | setOperationAction(ISD::BITCAST , MVT::f32 , Expand); |
292 | setOperationAction(ISD::BITCAST , MVT::i32 , Expand); |
293 | if (Subtarget.is64Bit()) { |
294 | setOperationAction(ISD::BITCAST , MVT::f64 , Expand); |
295 | // Without SSE, i64->f64 goes through memory. |
296 | setOperationAction(ISD::BITCAST , MVT::i64 , Expand); |
297 | } |
298 | } else if (!Subtarget.is64Bit()) |
299 | setOperationAction(ISD::BITCAST , MVT::i64 , Custom); |
300 | |
301 | // Scalar integer divide and remainder are lowered to use operations that |
302 | // produce two results, to match the available instructions. This exposes |
303 | // the two-result form to trivial CSE, which is able to combine x/y and x%y |
304 | // into a single instruction. |
305 | // |
306 | // Scalar integer multiply-high is also lowered to use two-result |
307 | // operations, to match the available instructions. However, plain multiply |
308 | // (low) operations are left as Legal, as there are single-result |
309 | // instructions for this in x86. Using the two-result multiply instructions |
310 | // when both high and low results are needed must be arranged by dagcombine. |
311 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
312 | setOperationAction(ISD::MULHS, VT, Expand); |
313 | setOperationAction(ISD::MULHU, VT, Expand); |
314 | setOperationAction(ISD::SDIV, VT, Expand); |
315 | setOperationAction(ISD::UDIV, VT, Expand); |
316 | setOperationAction(ISD::SREM, VT, Expand); |
317 | setOperationAction(ISD::UREM, VT, Expand); |
318 | } |
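// A standalone sketch of the payoff (hypothetical helper, not from this
// file): after CSE of the two-result node, both values below come from a
// single hardware IDIV, which leaves the quotient in EAX and the remainder
// in EDX.
static void divAndRem(int X, int Y, int &Quot, int &Rem) {
  Quot = X / Y; // same IDIV as below once the two-result form is CSE'd
  Rem = X % Y;  // no second divide is emitted
}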
319 | |
320 | setOperationAction(ISD::BR_JT , MVT::Other, Expand); |
321 | setOperationAction(ISD::BRCOND , MVT::Other, Custom); |
322 | for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, |
323 | MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
324 | setOperationAction(ISD::BR_CC, VT, Expand); |
325 | setOperationAction(ISD::SELECT_CC, VT, Expand); |
326 | } |
327 | if (Subtarget.is64Bit()) |
328 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); |
329 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); |
330 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); |
331 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); |
332 | setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); |
333 | |
334 | setOperationAction(ISD::FREM , MVT::f32 , Expand); |
335 | setOperationAction(ISD::FREM , MVT::f64 , Expand); |
336 | setOperationAction(ISD::FREM , MVT::f80 , Expand); |
337 | setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); |
338 | |
339 | // Promote the i8 variants and force them on up to i32 which has a shorter |
340 | // encoding. |
341 | setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); |
342 | setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); |
343 | if (!Subtarget.hasBMI()) { |
344 | setOperationAction(ISD::CTTZ , MVT::i16 , Custom); |
345 | setOperationAction(ISD::CTTZ , MVT::i32 , Custom); |
346 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); |
347 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); |
348 | if (Subtarget.is64Bit()) { |
349 | setOperationAction(ISD::CTTZ , MVT::i64 , Custom); |
350 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); |
351 | } |
352 | } |
353 | |
354 | if (Subtarget.hasLZCNT()) { |
355 | // When promoting the i8 variants, force them to i32 for a shorter |
356 | // encoding. |
357 | setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); |
358 | setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); |
359 | } else { |
360 | setOperationAction(ISD::CTLZ , MVT::i8 , Custom); |
361 | setOperationAction(ISD::CTLZ , MVT::i16 , Custom); |
362 | setOperationAction(ISD::CTLZ , MVT::i32 , Custom); |
363 | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); |
364 | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); |
365 | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); |
366 | if (Subtarget.is64Bit()) { |
367 | setOperationAction(ISD::CTLZ , MVT::i64 , Custom); |
368 | setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); |
369 | } |
370 | } |
371 | |
372 | // Special handling for half-precision floating point conversions. |
373 | // If we don't have F16C support, then lower half float conversions |
374 | // into library calls. |
375 | if (Subtarget.useSoftFloat() || |
376 | (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) { |
377 | setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); |
378 | setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); |
379 | } |
380 | |
381 | // There's never any support for operations beyond MVT::f32. |
382 | setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); |
383 | setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); |
384 | setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); |
385 | setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); |
386 | |
387 | setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); |
388 | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); |
389 | setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); |
390 | setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
391 | setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
392 | setTruncStoreAction(MVT::f80, MVT::f16, Expand); |
393 | |
394 | if (Subtarget.hasPOPCNT()) { |
395 | setOperationAction(ISD::CTPOP , MVT::i8 , Promote); |
396 | } else { |
397 | setOperationAction(ISD::CTPOP , MVT::i8 , Expand); |
398 | setOperationAction(ISD::CTPOP , MVT::i16 , Expand); |
399 | setOperationAction(ISD::CTPOP , MVT::i32 , Expand); |
400 | if (Subtarget.is64Bit()) |
401 | setOperationAction(ISD::CTPOP , MVT::i64 , Expand); |
402 | } |
403 | |
404 | setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); |
405 | |
406 | if (!Subtarget.hasMOVBE()) |
407 | setOperationAction(ISD::BSWAP , MVT::i16 , Expand); |
408 | |
409 | // These should be promoted to a larger select which is supported. |
410 | setOperationAction(ISD::SELECT , MVT::i1 , Promote); |
411 | // X86 wants to expand cmov itself. |
412 | for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { |
413 | setOperationAction(ISD::SELECT, VT, Custom); |
414 | setOperationAction(ISD::SETCC, VT, Custom); |
415 | } |
416 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
417 | if (VT == MVT::i64 && !Subtarget.is64Bit()) |
418 | continue; |
419 | setOperationAction(ISD::SELECT, VT, Custom); |
420 | setOperationAction(ISD::SETCC, VT, Custom); |
421 | } |
422 | setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); |
423 | // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj |
424 | // exception handling; they are a light-weight setjmp/longjmp replacement |
425 | // used to support continuations, user-level threading, etc. As a result, |
426 | // no other SjLj exception interfaces are implemented, so please don't |
427 | // build your own exception handling on top of them. |
428 | // LLVM/Clang supports zero-cost DWARF exception handling. |
429 | setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); |
430 | setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); |
431 | setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); |
432 | if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) |
433 | setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); |
434 | |
435 | // Darwin ABI issue. |
436 | for (auto VT : { MVT::i32, MVT::i64 }) { |
437 | if (VT == MVT::i64 && !Subtarget.is64Bit()) |
438 | continue; |
439 | setOperationAction(ISD::ConstantPool , VT, Custom); |
440 | setOperationAction(ISD::JumpTable , VT, Custom); |
441 | setOperationAction(ISD::GlobalAddress , VT, Custom); |
442 | setOperationAction(ISD::GlobalTLSAddress, VT, Custom); |
443 | setOperationAction(ISD::ExternalSymbol , VT, Custom); |
444 | setOperationAction(ISD::BlockAddress , VT, Custom); |
445 | } |
446 | |
447 | // 64-bit shl, sra, srl (iff 32-bit x86) |
448 | for (auto VT : { MVT::i32, MVT::i64 }) { |
449 | if (VT == MVT::i64 && !Subtarget.is64Bit()) |
450 | continue; |
451 | setOperationAction(ISD::SHL_PARTS, VT, Custom); |
452 | setOperationAction(ISD::SRA_PARTS, VT, Custom); |
453 | setOperationAction(ISD::SRL_PARTS, VT, Custom); |
454 | } |
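// A runnable sketch of what SHL_PARTS expands to on 32-bit x86
// (hypothetical helper): a 64-bit left shift assembled from two 32-bit
// halves, with the cross-over case for amounts of 32 or more.
static unsigned long long shl64Parts(unsigned Lo, unsigned Hi, unsigned Amt) {
  unsigned OutLo, OutHi;
  if (Amt == 0) {
    OutLo = Lo;
    OutHi = Hi;
  } else if (Amt < 32) {
    OutHi = (Hi << Amt) | (Lo >> (32 - Amt)); // bits carried across halves
    OutLo = Lo << Amt;
  } else { // 32 <= Amt < 64: the low half moves entirely into the high half
    OutHi = Lo << (Amt - 32);
    OutLo = 0;
  }
  return ((unsigned long long)OutHi << 32) | OutLo;
}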
455 | |
456 | if (Subtarget.hasSSE1()) |
457 | setOperationAction(ISD::PREFETCH , MVT::Other, Legal); |
458 | |
459 | setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); |
460 | |
461 | // Expand certain atomics |
462 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
463 | setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); |
464 | setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); |
465 | setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); |
466 | setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); |
467 | setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); |
468 | setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); |
469 | setOperationAction(ISD::ATOMIC_STORE, VT, Custom); |
470 | } |
471 | |
472 | if (Subtarget.hasCmpxchg16b()) { |
473 | setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); |
474 | } |
475 | |
476 | // FIXME - use subtarget debug flags |
477 | if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && |
478 | !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && |
479 | TM.Options.ExceptionModel != ExceptionHandling::SjLj) { |
480 | setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); |
481 | } |
482 | |
483 | setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); |
484 | setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); |
485 | |
486 | setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); |
487 | setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); |
488 | |
489 | setOperationAction(ISD::TRAP, MVT::Other, Legal); |
490 | setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); |
491 | |
492 | // VASTART needs to be custom lowered to use the VarArgsFrameIndex |
493 | setOperationAction(ISD::VASTART , MVT::Other, Custom); |
494 | setOperationAction(ISD::VAEND , MVT::Other, Expand); |
495 | bool Is64Bit = Subtarget.is64Bit(); |
496 | setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); |
497 | setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); |
498 | |
499 | setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); |
500 | setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); |
501 | |
502 | setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); |
503 | |
504 | // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. |
505 | setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); |
506 | setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); |
507 | |
508 | if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { |
509 | // f32 and f64 use SSE. |
510 | // Set up the FP register classes. |
511 | addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass |
512 | : &X86::FR32RegClass); |
513 | addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass |
514 | : &X86::FR64RegClass); |
515 | |
516 | for (auto VT : { MVT::f32, MVT::f64 }) { |
517 | // Use ANDPD to simulate FABS. |
518 | setOperationAction(ISD::FABS, VT, Custom); |
519 | |
520 | // Use XORP to simulate FNEG. |
521 | setOperationAction(ISD::FNEG, VT, Custom); |
522 | |
523 | // Use ANDPD and ORPD to simulate FCOPYSIGN. |
524 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
525 | |
526 | // We don't support sin/cos/fmod |
527 | setOperationAction(ISD::FSIN , VT, Expand); |
528 | setOperationAction(ISD::FCOS , VT, Expand); |
529 | setOperationAction(ISD::FSINCOS, VT, Expand); |
530 | } |
531 | |
532 | // Lower this to MOVMSK plus an AND. |
533 | setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); |
534 | setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); |
535 | |
536 | // Expand FP immediates into loads from the stack, except for the special |
537 | // cases we handle. |
538 | addLegalFPImmediate(APFloat(+0.0)); // xorpd |
539 | addLegalFPImmediate(APFloat(+0.0f)); // xorps |
540 | } else if (UseX87 && X86ScalarSSEf32) { |
541 | // Use SSE for f32, x87 for f64. |
542 | // Set up the FP register classes. |
543 | addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass |
544 | : &X86::FR32RegClass); |
545 | addRegisterClass(MVT::f64, &X86::RFP64RegClass); |
546 | |
547 | // Use ANDPS to simulate FABS. |
548 | setOperationAction(ISD::FABS , MVT::f32, Custom); |
549 | |
550 | // Use XORP to simulate FNEG. |
551 | setOperationAction(ISD::FNEG , MVT::f32, Custom); |
552 | |
553 | setOperationAction(ISD::UNDEF, MVT::f64, Expand); |
554 | |
555 | // Use ANDPS and ORPS to simulate FCOPYSIGN. |
556 | setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); |
557 | setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); |
558 | |
559 | // We don't support sin/cos/fmod |
560 | setOperationAction(ISD::FSIN , MVT::f32, Expand); |
561 | setOperationAction(ISD::FCOS , MVT::f32, Expand); |
562 | setOperationAction(ISD::FSINCOS, MVT::f32, Expand); |
563 | |
564 | // Special cases we handle for FP constants. |
565 | addLegalFPImmediate(APFloat(+0.0f)); // xorps |
566 | addLegalFPImmediate(APFloat(+0.0)); // FLD0 |
567 | addLegalFPImmediate(APFloat(+1.0)); // FLD1 |
568 | addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS |
569 | addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS |
570 | |
571 | if (!TM.Options.UnsafeFPMath) { |
572 | setOperationAction(ISD::FSIN , MVT::f64, Expand); |
573 | setOperationAction(ISD::FCOS , MVT::f64, Expand); |
574 | setOperationAction(ISD::FSINCOS, MVT::f64, Expand); |
575 | } |
576 | } else if (UseX87) { |
577 | // f32 and f64 in x87. |
578 | // Set up the FP register classes. |
579 | addRegisterClass(MVT::f64, &X86::RFP64RegClass); |
580 | addRegisterClass(MVT::f32, &X86::RFP32RegClass); |
581 | |
582 | for (auto VT : { MVT::f32, MVT::f64 }) { |
583 | setOperationAction(ISD::UNDEF, VT, Expand); |
584 | setOperationAction(ISD::FCOPYSIGN, VT, Expand); |
585 | |
586 | if (!TM.Options.UnsafeFPMath) { |
587 | setOperationAction(ISD::FSIN , VT, Expand); |
588 | setOperationAction(ISD::FCOS , VT, Expand); |
589 | setOperationAction(ISD::FSINCOS, VT, Expand); |
590 | } |
591 | } |
592 | addLegalFPImmediate(APFloat(+0.0)); // FLD0 |
593 | addLegalFPImmediate(APFloat(+1.0)); // FLD1 |
594 | addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS |
595 | addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS |
596 | addLegalFPImmediate(APFloat(+0.0f)); // FLD0 |
597 | addLegalFPImmediate(APFloat(+1.0f)); // FLD1 |
598 | addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS |
599 | addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS |
600 | } |
601 | |
602 | // We don't support FMA. |
603 | setOperationAction(ISD::FMA, MVT::f64, Expand); |
604 | setOperationAction(ISD::FMA, MVT::f32, Expand); |
605 | |
606 | // Long double always uses X87, except f128 in MMX. |
607 | if (UseX87) { |
608 | if (Subtarget.is64Bit() && Subtarget.hasMMX()) { |
609 | addRegisterClass(MVT::f128, &X86::FR128RegClass); |
610 | ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); |
611 | setOperationAction(ISD::FABS , MVT::f128, Custom); |
612 | setOperationAction(ISD::FNEG , MVT::f128, Custom); |
613 | setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); |
614 | } |
615 | |
616 | addRegisterClass(MVT::f80, &X86::RFP80RegClass); |
617 | setOperationAction(ISD::UNDEF, MVT::f80, Expand); |
618 | setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); |
619 | { |
620 | APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); |
621 | addLegalFPImmediate(TmpFlt); // FLD0 |
622 | TmpFlt.changeSign(); |
623 | addLegalFPImmediate(TmpFlt); // FLD0/FCHS |
624 | |
625 | bool ignored; |
626 | APFloat TmpFlt2(+1.0); |
627 | TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, |
628 | &ignored); |
629 | addLegalFPImmediate(TmpFlt2); // FLD1 |
630 | TmpFlt2.changeSign(); |
631 | addLegalFPImmediate(TmpFlt2); // FLD1/FCHS |
632 | } |
633 | |
634 | if (!TM.Options.UnsafeFPMath) { |
635 | setOperationAction(ISD::FSIN , MVT::f80, Expand); |
636 | setOperationAction(ISD::FCOS , MVT::f80, Expand); |
637 | setOperationAction(ISD::FSINCOS, MVT::f80, Expand); |
638 | } |
639 | |
640 | setOperationAction(ISD::FFLOOR, MVT::f80, Expand); |
641 | setOperationAction(ISD::FCEIL, MVT::f80, Expand); |
642 | setOperationAction(ISD::FTRUNC, MVT::f80, Expand); |
643 | setOperationAction(ISD::FRINT, MVT::f80, Expand); |
644 | setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); |
645 | setOperationAction(ISD::FMA, MVT::f80, Expand); |
646 | } |
647 | |
648 | // Always use a library call for pow. |
649 | setOperationAction(ISD::FPOW , MVT::f32 , Expand); |
650 | setOperationAction(ISD::FPOW , MVT::f64 , Expand); |
651 | setOperationAction(ISD::FPOW , MVT::f80 , Expand); |
652 | |
653 | setOperationAction(ISD::FLOG, MVT::f80, Expand); |
654 | setOperationAction(ISD::FLOG2, MVT::f80, Expand); |
655 | setOperationAction(ISD::FLOG10, MVT::f80, Expand); |
656 | setOperationAction(ISD::FEXP, MVT::f80, Expand); |
657 | setOperationAction(ISD::FEXP2, MVT::f80, Expand); |
658 | setOperationAction(ISD::FMINNUM, MVT::f80, Expand); |
659 | setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); |
660 | |
661 | // Some FP actions are always expanded for vector types. |
662 | for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, |
663 | MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { |
664 | setOperationAction(ISD::FSIN, VT, Expand); |
665 | setOperationAction(ISD::FSINCOS, VT, Expand); |
666 | setOperationAction(ISD::FCOS, VT, Expand); |
667 | setOperationAction(ISD::FREM, VT, Expand); |
668 | setOperationAction(ISD::FCOPYSIGN, VT, Expand); |
669 | setOperationAction(ISD::FPOW, VT, Expand); |
670 | setOperationAction(ISD::FLOG, VT, Expand); |
671 | setOperationAction(ISD::FLOG2, VT, Expand); |
672 | setOperationAction(ISD::FLOG10, VT, Expand); |
673 | setOperationAction(ISD::FEXP, VT, Expand); |
674 | setOperationAction(ISD::FEXP2, VT, Expand); |
675 | } |
676 | |
677 | // First set operation action for all vector types to either promote |
678 | // (for widening) or expand (for scalarization). Then we will selectively |
679 | // turn on ones that can be effectively codegen'd. |
680 | for (MVT VT : MVT::vector_valuetypes()) { |
681 | setOperationAction(ISD::SDIV, VT, Expand); |
682 | setOperationAction(ISD::UDIV, VT, Expand); |
683 | setOperationAction(ISD::SREM, VT, Expand); |
684 | setOperationAction(ISD::UREM, VT, Expand); |
685 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); |
686 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); |
687 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand); |
688 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand); |
689 | setOperationAction(ISD::FMA, VT, Expand); |
690 | setOperationAction(ISD::FFLOOR, VT, Expand); |
691 | setOperationAction(ISD::FCEIL, VT, Expand); |
692 | setOperationAction(ISD::FTRUNC, VT, Expand); |
693 | setOperationAction(ISD::FRINT, VT, Expand); |
694 | setOperationAction(ISD::FNEARBYINT, VT, Expand); |
695 | setOperationAction(ISD::SMUL_LOHI, VT, Expand); |
696 | setOperationAction(ISD::MULHS, VT, Expand); |
697 | setOperationAction(ISD::UMUL_LOHI, VT, Expand); |
698 | setOperationAction(ISD::MULHU, VT, Expand); |
699 | setOperationAction(ISD::SDIVREM, VT, Expand); |
700 | setOperationAction(ISD::UDIVREM, VT, Expand); |
701 | setOperationAction(ISD::CTPOP, VT, Expand); |
702 | setOperationAction(ISD::CTTZ, VT, Expand); |
703 | setOperationAction(ISD::CTLZ, VT, Expand); |
704 | setOperationAction(ISD::ROTL, VT, Expand); |
705 | setOperationAction(ISD::ROTR, VT, Expand); |
706 | setOperationAction(ISD::BSWAP, VT, Expand); |
707 | setOperationAction(ISD::SETCC, VT, Expand); |
708 | setOperationAction(ISD::FP_TO_UINT, VT, Expand); |
709 | setOperationAction(ISD::FP_TO_SINT, VT, Expand); |
710 | setOperationAction(ISD::UINT_TO_FP, VT, Expand); |
711 | setOperationAction(ISD::SINT_TO_FP, VT, Expand); |
712 | setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); |
713 | setOperationAction(ISD::TRUNCATE, VT, Expand); |
714 | setOperationAction(ISD::SIGN_EXTEND, VT, Expand); |
715 | setOperationAction(ISD::ZERO_EXTEND, VT, Expand); |
716 | setOperationAction(ISD::ANY_EXTEND, VT, Expand); |
717 | setOperationAction(ISD::SELECT_CC, VT, Expand); |
718 | for (MVT InnerVT : MVT::vector_valuetypes()) { |
719 | setTruncStoreAction(InnerVT, VT, Expand); |
720 | |
721 | setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); |
722 | setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); |
723 | |
724 | // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like |
725 | // types, we have to deal with them whether we ask for Expansion or not. |
726 | // Setting Expand causes its own optimisation problems though, so leave |
727 | // them legal. |
728 | if (VT.getVectorElementType() == MVT::i1) |
729 | setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
730 | |
731 | // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are |
732 | // split/scalarized right now. |
733 | if (VT.getVectorElementType() == MVT::f16) |
734 | setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); |
735 | } |
736 | } |
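// The pattern above in miniature (hypothetical target hook, not X86 code):
// default an op to Expand for every vector type, then selectively re-mark
// it Legal once the subtarget is known to support it.
//
//   for (MVT VT : MVT::vector_valuetypes())
//     setOperationAction(ISD::CTPOP, VT, Expand);        // pessimistic default
//   if (HasVectorPopcount)
//     setOperationAction(ISD::CTPOP, MVT::v4i32, Legal); // turn back on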
737 | |
738 | // FIXME: In order to prevent SSE instructions being expanded to MMX ones |
739 | // with -msoft-float, disable use of MMX as well. |
740 | if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { |
741 | addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); |
742 | // No operations on x86mmx supported, everything uses intrinsics. |
743 | } |
744 | |
745 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { |
746 | addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass |
747 | : &X86::VR128RegClass); |
748 | |
749 | setOperationAction(ISD::FNEG, MVT::v4f32, Custom); |
750 | setOperationAction(ISD::FABS, MVT::v4f32, Custom); |
751 | setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); |
752 | setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); |
753 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); |
754 | setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); |
755 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); |
756 | setOperationAction(ISD::SELECT, MVT::v4f32, Custom); |
757 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); |
758 | } |
759 | |
760 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { |
761 | addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass |
762 | : &X86::VR128RegClass); |
763 | |
764 | // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM |
765 | // registers cannot be used even for integer operations. |
766 | addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass |
767 | : &X86::VR128RegClass); |
768 | addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass |
769 | : &X86::VR128RegClass); |
770 | addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass |
771 | : &X86::VR128RegClass); |
772 | addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass |
773 | : &X86::VR128RegClass); |
774 | |
775 | setOperationAction(ISD::MUL, MVT::v16i8, Custom); |
776 | setOperationAction(ISD::MUL, MVT::v4i32, Custom); |
777 | setOperationAction(ISD::MUL, MVT::v2i64, Custom); |
778 | setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); |
779 | setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); |
780 | setOperationAction(ISD::MULHU, MVT::v16i8, Custom); |
781 | setOperationAction(ISD::MULHS, MVT::v16i8, Custom); |
782 | setOperationAction(ISD::MULHU, MVT::v8i16, Legal); |
783 | setOperationAction(ISD::MULHS, MVT::v8i16, Legal); |
784 | setOperationAction(ISD::MUL, MVT::v8i16, Legal); |
785 | setOperationAction(ISD::FNEG, MVT::v2f64, Custom); |
786 | setOperationAction(ISD::FABS, MVT::v2f64, Custom); |
787 | setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); |
788 | |
789 | setOperationAction(ISD::SMAX, MVT::v8i16, Legal); |
790 | setOperationAction(ISD::UMAX, MVT::v16i8, Legal); |
791 | setOperationAction(ISD::SMIN, MVT::v8i16, Legal); |
792 | setOperationAction(ISD::UMIN, MVT::v16i8, Legal); |
793 | |
794 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); |
795 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); |
796 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); |
797 | |
798 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
799 | setOperationAction(ISD::SETCC, VT, Custom); |
800 | setOperationAction(ISD::CTPOP, VT, Custom); |
801 | setOperationAction(ISD::CTTZ, VT, Custom); |
802 | } |
803 | |
804 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
805 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
806 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
807 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
808 | setOperationAction(ISD::VSELECT, VT, Custom); |
809 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
810 | } |
811 | |
812 | // We support custom legalizing of sext and anyext loads for specific |
813 | // memory vector types which we can load as a scalar (or sequence of |
814 | // scalars) and extend in-register to a legal 128-bit vector type. For sext |
815 | // loads these must work with a single scalar load. |
816 | for (MVT VT : MVT::integer_vector_valuetypes()) { |
817 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); |
818 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); |
819 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); |
820 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); |
821 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); |
822 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); |
823 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); |
824 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); |
825 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); |
826 | } |
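// A scalar sketch of the in-register extend these hooks produce, assuming a
// v4i8 sext load (hypothetical helper): one 32-bit scalar load grabs all
// four bytes, then each lane is sign-extended in-register. Lane order
// assumes little-endian byte order, which holds on x86.
static void sextLoadV4i8(const unsigned char *P, int Out[4]) {
  unsigned Packed;
  __builtin_memcpy(&Packed, P, 4); // single scalar load of all 4 bytes
  for (int I = 0; I != 4; ++I)
    Out[I] = (signed char)(Packed >> (8 * I)); // per-lane sign extension
}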
827 | |
828 | for (auto VT : { MVT::v2f64, MVT::v2i64 }) { |
829 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
830 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
831 | setOperationAction(ISD::VSELECT, VT, Custom); |
832 | |
833 | if (VT == MVT::v2i64 && !Subtarget.is64Bit()) |
834 | continue; |
835 | |
836 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
837 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
838 | } |
839 | |
840 | // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. |
841 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
842 | setOperationPromotedToType(ISD::AND, VT, MVT::v2i64); |
843 | setOperationPromotedToType(ISD::OR, VT, MVT::v2i64); |
844 | setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64); |
845 | setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64); |
846 | setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64); |
847 | } |
848 | |
849 | // Custom lower v2i64 and v2f64 selects. |
850 | setOperationAction(ISD::SELECT, MVT::v2f64, Custom); |
851 | setOperationAction(ISD::SELECT, MVT::v2i64, Custom); |
852 | |
853 | setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); |
854 | setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); |
855 | |
856 | setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); |
857 | setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); |
858 | |
859 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); |
860 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); |
861 | setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); |
862 | |
863 | // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. |
864 | setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); |
865 | |
866 | setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); |
867 | setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); |
868 | |
869 | for (MVT VT : MVT::fp_vector_valuetypes()) |
870 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); |
871 | |
872 | setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); |
873 | setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); |
874 | setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); |
875 | |
876 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); |
877 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); |
878 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); |
879 | |
880 | // In the customized shift lowering, the legal v4i32/v2i64 cases |
881 | // in AVX2 will be recognized. |
882 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
883 | setOperationAction(ISD::SRL, VT, Custom); |
884 | setOperationAction(ISD::SHL, VT, Custom); |
885 | setOperationAction(ISD::SRA, VT, Custom); |
886 | } |
887 | } |
888 | |
889 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { |
890 | setOperationAction(ISD::ABS, MVT::v16i8, Legal); |
891 | setOperationAction(ISD::ABS, MVT::v8i16, Legal); |
892 | setOperationAction(ISD::ABS, MVT::v4i32, Legal); |
893 | setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); |
894 | setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); |
895 | setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); |
896 | setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); |
897 | setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); |
898 | } |
899 | |
900 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { |
901 | for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { |
902 | setOperationAction(ISD::FFLOOR, RoundedTy, Legal); |
903 | setOperationAction(ISD::FCEIL, RoundedTy, Legal); |
904 | setOperationAction(ISD::FTRUNC, RoundedTy, Legal); |
905 | setOperationAction(ISD::FRINT, RoundedTy, Legal); |
906 | setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); |
907 | } |
908 | |
909 | setOperationAction(ISD::SMAX, MVT::v16i8, Legal); |
910 | setOperationAction(ISD::SMAX, MVT::v4i32, Legal); |
911 | setOperationAction(ISD::UMAX, MVT::v8i16, Legal); |
912 | setOperationAction(ISD::UMAX, MVT::v4i32, Legal); |
913 | setOperationAction(ISD::SMIN, MVT::v16i8, Legal); |
914 | setOperationAction(ISD::SMIN, MVT::v4i32, Legal); |
915 | setOperationAction(ISD::UMIN, MVT::v8i16, Legal); |
916 | setOperationAction(ISD::UMIN, MVT::v4i32, Legal); |
917 | |
918 | // FIXME: Do we need to handle scalar-to-vector here? |
919 | setOperationAction(ISD::MUL, MVT::v4i32, Legal); |
920 | |
921 | // We directly match byte blends in the backend as they match the VSELECT |
922 | // condition form. |
923 | setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); |
924 | |
925 | // SSE41 brings specific instructions for doing vector sign extend even in |
926 | // cases where we don't have SRA. |
927 | for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
928 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); |
929 | setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); |
930 | } |
931 | |
932 | for (MVT VT : MVT::integer_vector_valuetypes()) { |
933 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); |
934 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); |
935 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); |
936 | } |
937 | |
938 | // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X |
939 | for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { |
940 | setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); |
941 | setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); |
942 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); |
943 | setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); |
944 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); |
945 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); |
946 | } |
947 | |
948 | // i8 vectors are custom because the source register and source |
949 | // memory operand types are not the same width. |
950 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); |
951 | } |
952 | |
953 | if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { |
954 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
955 | MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
956 | setOperationAction(ISD::ROTL, VT, Custom); |
957 | |
958 | // XOP can efficiently perform BITREVERSE with VPPERM. |
959 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) |
960 | setOperationAction(ISD::BITREVERSE, VT, Custom); |
961 | |
962 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
963 | MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
964 | setOperationAction(ISD::BITREVERSE, VT, Custom); |
965 | } |
966 | |
967 | if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) { |
968 | bool HasInt256 = Subtarget.hasInt256(); |
969 | |
970 | addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass |
971 | : &X86::VR256RegClass); |
972 | addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass |
973 | : &X86::VR256RegClass); |
974 | addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass |
975 | : &X86::VR256RegClass); |
976 | addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass |
977 | : &X86::VR256RegClass); |
978 | addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass |
979 | : &X86::VR256RegClass); |
980 | addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass |
981 | : &X86::VR256RegClass); |
982 | |
983 | for (auto VT : { MVT::v8f32, MVT::v4f64 }) { |
984 | setOperationAction(ISD::FFLOOR, VT, Legal); |
985 | setOperationAction(ISD::FCEIL, VT, Legal); |
986 | setOperationAction(ISD::FTRUNC, VT, Legal); |
987 | setOperationAction(ISD::FRINT, VT, Legal); |
988 | setOperationAction(ISD::FNEARBYINT, VT, Legal); |
989 | setOperationAction(ISD::FNEG, VT, Custom); |
990 | setOperationAction(ISD::FABS, VT, Custom); |
991 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
992 | } |
993 | |
994 | // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted |
995 | // even though v8i16 is a legal type. |
996 | setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); |
997 | setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); |
998 | setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); |
999 | |
1000 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); |
1001 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); |
1002 | setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); |
1003 | |
1004 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); |
1005 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); |
1006 | |
1007 | for (MVT VT : MVT::fp_vector_valuetypes()) |
1008 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); |
1009 | |
1010 | // In the customized shift lowering, the legal v8i32/v4i64 cases |
1011 | // in AVX2 will be recognized. |
1012 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
1013 | setOperationAction(ISD::SRL, VT, Custom); |
1014 | setOperationAction(ISD::SHL, VT, Custom); |
1015 | setOperationAction(ISD::SRA, VT, Custom); |
1016 | } |
1017 | |
1018 | setOperationAction(ISD::SELECT, MVT::v4f64, Custom); |
1019 | setOperationAction(ISD::SELECT, MVT::v4i64, Custom); |
1020 | setOperationAction(ISD::SELECT, MVT::v8f32, Custom); |
1021 | |
1022 | for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
1023 | setOperationAction(ISD::SIGN_EXTEND, VT, Custom); |
1024 | setOperationAction(ISD::ZERO_EXTEND, VT, Custom); |
1025 | setOperationAction(ISD::ANY_EXTEND, VT, Custom); |
1026 | } |
1027 | |
1028 | setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); |
1029 | setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); |
1030 | setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); |
1031 | setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); |
1032 | |
1033 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
1034 | setOperationAction(ISD::SETCC, VT, Custom); |
1035 | setOperationAction(ISD::CTPOP, VT, Custom); |
1036 | setOperationAction(ISD::CTTZ, VT, Custom); |
1037 | setOperationAction(ISD::CTLZ, VT, Custom); |
1038 | } |
1039 | |
1040 | if (Subtarget.hasAnyFMA()) { |
1041 | for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, |
1042 | MVT::v2f64, MVT::v4f64 }) |
1043 | setOperationAction(ISD::FMA, VT, Legal); |
1044 | } |
1045 | |
1046 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { |
1047 | setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); |
1048 | setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); |
1049 | } |
1050 | |
1051 | setOperationAction(ISD::MUL, MVT::v4i64, Custom); |
1052 | setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); |
1053 | setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); |
1054 | setOperationAction(ISD::MUL, MVT::v32i8, Custom); |
1055 | |
1056 | setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); |
1057 | setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); |
1058 | |
1059 | setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); |
1060 | setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); |
1061 | setOperationAction(ISD::MULHU, MVT::v32i8, Custom); |
1062 | setOperationAction(ISD::MULHS, MVT::v32i8, Custom); |
1063 | |
1064 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { |
1065 | setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); |
1066 | setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); |
1067 | setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); |
1068 | setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); |
1069 | setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); |
1070 | } |
1071 | |
1072 | if (HasInt256) { |
1073 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); |
1074 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom); |
1075 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom); |
1076 | |
1077 | // The custom lowering for UINT_TO_FP for v8i32 becomes interesting |
1078 | // when we have a 256-bit-wide blend with immediate. |
1079 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); |
1080 | |
1081 | // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X |
1082 | for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { |
1083 | setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); |
1084 | setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); |
1085 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); |
1086 | setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); |
1087 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); |
1088 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); |
1089 | } |
1090 | } |
1091 | |
1092 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
1093 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { |
1094 | setOperationAction(ISD::MLOAD, VT, Legal); |
1095 | setOperationAction(ISD::MSTORE, VT, Legal); |
1096 | } |
1097 | |
1098 | // Extract subvector is special because the value type |
1099 | // (result) is 128-bit but the source is 256-bit wide. |
1100 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, |
1101 | MVT::v4f32, MVT::v2f64 }) { |
1102 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
1103 | } |
1104 | |
1105 | // Custom lower several nodes for 256-bit types. |
1106 | for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, |
1107 | MVT::v8f32, MVT::v4f64 }) { |
1108 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
1109 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
1110 | setOperationAction(ISD::VSELECT, VT, Custom); |
1111 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
1112 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
1113 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
1114 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); |
1115 | setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); |
1116 | } |
1117 | |
1118 | if (HasInt256) |
1119 | setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); |
1120 | |
1121 | // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. |
1122 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { |
1123 | setOperationPromotedToType(ISD::AND, VT, MVT::v4i64); |
1124 | setOperationPromotedToType(ISD::OR, VT, MVT::v4i64); |
1125 | setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64); |
1126 | setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); |
1127 | setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64); |
1128 | } |
1129 | } |
1130 | |
1131 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { |
1132 | addRegisterClass(MVT::v16i32, &X86::VR512RegClass); |
1133 | addRegisterClass(MVT::v16f32, &X86::VR512RegClass); |
1134 | addRegisterClass(MVT::v8i64, &X86::VR512RegClass); |
1135 | addRegisterClass(MVT::v8f64, &X86::VR512RegClass); |
1136 | |
1137 | addRegisterClass(MVT::v1i1, &X86::VK1RegClass); |
1138 | addRegisterClass(MVT::v8i1, &X86::VK8RegClass); |
1139 | addRegisterClass(MVT::v16i1, &X86::VK16RegClass); |
1140 | |
1141 | for (MVT VT : MVT::fp_vector_valuetypes()) |
1142 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); |
1143 | |
1144 | for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { |
1145 | setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); |
1146 | setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); |
1147 | setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); |
1148 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); |
1149 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); |
1150 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); |
1151 | } |
1152 | |
1153 | for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, |
1154 | MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, |
1155 | MVT::v8i64, MVT::v32i16, MVT::v64i8}) { |
1156 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
1157 | setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom); |
1158 | setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom); |
1159 | setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom); |
1160 | setTruncStoreAction(VT, MaskVT, Custom); |
1161 | } |
1162 | |
1163 | for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { |
1164 | setOperationAction(ISD::FNEG, VT, Custom); |
1165 | setOperationAction(ISD::FABS, VT, Custom); |
1166 | setOperationAction(ISD::FMA, VT, Legal); |
1167 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); |
1168 | } |
1169 | |
1170 | setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); |
1171 | setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); |
1172 | setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); |
1173 | setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); |
1174 | setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); |
1175 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); |
1176 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); |
1177 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); |
1178 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); |
1179 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); |
1180 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); |
1181 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); |
1182 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); |
1183 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); |
1184 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); |
1185 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); |
1186 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); |
1187 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); |
1188 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); |
1189 | setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); |
1190 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); |
1191 | setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); |
1192 | setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); |
1193 | setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); |
1194 | setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); |
1195 | |
1196 | setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); |
1197 | setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); |
1198 | setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); |
1199 | setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); |
1200 | setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); |
1201 | if (Subtarget.hasVLX()) {
1202 | setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); |
1203 | setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); |
1204 | setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); |
1205 | setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); |
1206 | setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); |
1207 | |
1208 | setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); |
1209 | setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); |
1210 | setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); |
1211 | setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); |
1212 | setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); |
1213 | } else { |
1214 | for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
1215 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { |
1216 | setOperationAction(ISD::MLOAD, VT, Custom); |
1217 | setOperationAction(ISD::MSTORE, VT, Custom); |
1218 | } |
1219 | } |
1220 | setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); |
1221 | setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); |
1222 | |
1223 | if (Subtarget.hasDQI()) { |
1224 | for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) { |
1225 | setOperationAction(ISD::SINT_TO_FP, VT, Legal); |
1226 | setOperationAction(ISD::UINT_TO_FP, VT, Legal); |
1227 | setOperationAction(ISD::FP_TO_SINT, VT, Legal); |
1228 | setOperationAction(ISD::FP_TO_UINT, VT, Legal); |
1229 | } |
1230 | if (Subtarget.hasVLX()) { |
1231 | // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion. |
1232 | setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); |
1233 | setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); |
1234 | setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); |
1235 | } |
1236 | } |
1237 | if (Subtarget.hasVLX()) { |
1238 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); |
1239 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); |
1240 | setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); |
1241 | setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); |
1242 | setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); |
1243 | setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); |
1244 | setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); |
1245 | setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); |
1246 | setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); |
1247 | setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); |
1248 | setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); |
1249 | |
1250 | // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1251 | setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); |
1252 | setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal); |
1253 | setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal); |
1254 | setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal); |
1255 | setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal); |
1256 | setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal); |
1257 | setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal); |
1258 | setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal); |
1259 | setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal); |
1260 | setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal); |
1261 | } |
1262 | |
1263 | setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); |
1264 | setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); |
1265 | setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); |
1266 | setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); |
1267 | setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); |
1268 | setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); |
1269 | setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); |
1270 | setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); |
1271 | setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); |
1272 | setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); |
1273 | |
1274 | for (auto VT : { MVT::v16f32, MVT::v8f64 }) { |
1275 | setOperationAction(ISD::FFLOOR, VT, Legal); |
1276 | setOperationAction(ISD::FCEIL, VT, Legal); |
1277 | setOperationAction(ISD::FTRUNC, VT, Legal); |
1278 | setOperationAction(ISD::FRINT, VT, Legal); |
1279 | setOperationAction(ISD::FNEARBYINT, VT, Legal); |
1280 | } |
1281 | |
1282 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); |
1283 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); |
1284 | |
1285 | // Without BWI we need to use custom lowering to handle MVT::v64i8 input. |
1286 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); |
1287 | setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); |
1288 | |
1289 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); |
1290 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); |
1291 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); |
1292 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); |
1293 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); |
1294 | |
1295 | setOperationAction(ISD::MUL, MVT::v8i64, Custom); |
1296 | |
1297 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); |
1298 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); |
1299 | setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); |
1300 | setOperationAction(ISD::SELECT, MVT::v8f64, Custom); |
1301 | setOperationAction(ISD::SELECT, MVT::v8i64, Custom); |
1302 | setOperationAction(ISD::SELECT, MVT::v16f32, Custom); |
1303 | |
1304 | setOperationAction(ISD::MUL, MVT::v16i32, Legal); |
1305 | |
1306 | // NonVLX sub-targets extend 128/256 vectors to use the 512 version. |
1307 | setOperationAction(ISD::ABS, MVT::v4i64, Legal); |
1308 | setOperationAction(ISD::ABS, MVT::v2i64, Legal); |
1309 | |
1310 | for (auto VT : { MVT::v8i1, MVT::v16i1 }) { |
1311 | setOperationAction(ISD::ADD, VT, Custom); |
1312 | setOperationAction(ISD::SUB, VT, Custom); |
1313 | setOperationAction(ISD::MUL, VT, Custom); |
1314 | setOperationAction(ISD::SETCC, VT, Custom); |
1315 | setOperationAction(ISD::SELECT, VT, Custom); |
1316 | setOperationAction(ISD::TRUNCATE, VT, Custom); |
1317 | |
1318 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
1319 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
1320 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
1321 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
1322 | setOperationAction(ISD::VSELECT, VT, Expand); |
1323 | } |
1324 | |
1325 | for (auto VT : { MVT::v16i32, MVT::v8i64 }) { |
1326 | setOperationAction(ISD::SMAX, VT, Legal); |
1327 | setOperationAction(ISD::UMAX, VT, Legal); |
1328 | setOperationAction(ISD::SMIN, VT, Legal); |
1329 | setOperationAction(ISD::UMIN, VT, Legal); |
1330 | setOperationAction(ISD::ABS, VT, Legal); |
1331 | setOperationAction(ISD::SRL, VT, Custom); |
1332 | setOperationAction(ISD::SHL, VT, Custom); |
1333 | setOperationAction(ISD::SRA, VT, Custom); |
1334 | setOperationAction(ISD::CTPOP, VT, Custom); |
1335 | setOperationAction(ISD::CTTZ, VT, Custom); |
1336 | } |
1337 | |
1338 | // Need to promote to 64-bit even though we have 32-bit masked instructions,
1339 | // because the IR optimizers rearrange bitcasts around logic ops, leaving
1340 | // too many variations to handle if we don't promote them.
1341 | setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64); |
1342 | setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); |
1343 | setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); |
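| // Illustrative example: both (and (bitcast X), (bitcast Y)) and
| // (bitcast (and X, Y)) now converge on a single v8i64 AND, wherever the
| // optimizer happened to place the bitcasts.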
1344 | |
1345 | if (Subtarget.hasCDI()) { |
1346 | // NonVLX sub-targets extend 128/256 vectors to use the 512 version. |
1347 | for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, |
1348 | MVT::v4i64, MVT::v8i64}) { |
1349 | setOperationAction(ISD::CTLZ, VT, Legal); |
1350 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); |
1351 | } |
1352 | } // Subtarget.hasCDI() |
1353 | |
1354 | if (Subtarget.hasDQI()) { |
1355 | // NonVLX sub-targets extend 128/256 vectors to use the 512 version. |
1356 | setOperationAction(ISD::MUL, MVT::v2i64, Legal); |
1357 | setOperationAction(ISD::MUL, MVT::v4i64, Legal); |
1358 | setOperationAction(ISD::MUL, MVT::v8i64, Legal); |
1359 | } |
1360 | |
1361 | if (Subtarget.hasVPOPCNTDQ()) { |
1362 | // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 |
1363 | // version of popcntd/q. |
1364 | for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, |
1365 | MVT::v4i32, MVT::v2i64}) |
1366 | setOperationAction(ISD::CTPOP, VT, Legal); |
1367 | } |
1368 | |
1369 | // Custom lower several nodes. |
1370 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, |
1371 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { |
1372 | setOperationAction(ISD::MGATHER, VT, Custom); |
1373 | setOperationAction(ISD::MSCATTER, VT, Custom); |
1374 | } |
1375 | // Extract subvector is special because the value type |
1376 | // (result) is 256-bit but the source is 512-bit wide. |
1377 | // 128-bit was made Custom under AVX1. |
1378 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, |
1379 | MVT::v8f32, MVT::v4f64 }) |
1380 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); |
1381 | for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, |
1382 | MVT::v16i1, MVT::v32i1, MVT::v64i1 }) |
1383 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); |
1384 | |
1385 | for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { |
1386 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
1387 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
1388 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
1389 | setOperationAction(ISD::VSELECT, VT, Custom); |
1390 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
1391 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); |
1392 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); |
1393 | setOperationAction(ISD::MLOAD, VT, Legal); |
1394 | setOperationAction(ISD::MSTORE, VT, Legal); |
1395 | setOperationAction(ISD::MGATHER, VT, Legal); |
1396 | setOperationAction(ISD::MSCATTER, VT, Custom); |
1397 | } |
1398 | for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { |
1399 | setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64); |
1400 | setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); |
1401 | } |
1402 | } // has AVX-512
1403 | |
1404 | if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { |
1405 | addRegisterClass(MVT::v32i16, &X86::VR512RegClass); |
1406 | addRegisterClass(MVT::v64i8, &X86::VR512RegClass); |
1407 | |
1408 | addRegisterClass(MVT::v32i1, &X86::VK32RegClass); |
1409 | addRegisterClass(MVT::v64i1, &X86::VK64RegClass); |
1410 | |
1411 | setOperationAction(ISD::ADD, MVT::v32i1, Custom); |
1412 | setOperationAction(ISD::ADD, MVT::v64i1, Custom); |
1413 | setOperationAction(ISD::SUB, MVT::v32i1, Custom); |
1414 | setOperationAction(ISD::SUB, MVT::v64i1, Custom); |
1415 | setOperationAction(ISD::MUL, MVT::v32i1, Custom); |
1416 | setOperationAction(ISD::MUL, MVT::v64i1, Custom); |
1417 | |
1418 | setOperationAction(ISD::SETCC, MVT::v32i1, Custom); |
1419 | setOperationAction(ISD::SETCC, MVT::v64i1, Custom); |
1420 | setOperationAction(ISD::MUL, MVT::v32i16, Legal); |
1421 | setOperationAction(ISD::MUL, MVT::v64i8, Custom); |
1422 | setOperationAction(ISD::MULHS, MVT::v32i16, Legal); |
1423 | setOperationAction(ISD::MULHU, MVT::v32i16, Legal); |
1424 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); |
1425 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); |
1426 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); |
1427 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); |
1428 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); |
1429 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); |
1430 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); |
1431 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); |
1432 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); |
1433 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); |
1434 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); |
1435 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom); |
1436 | setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); |
1437 | setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); |
1438 | setOperationAction(ISD::SELECT, MVT::v32i1, Custom); |
1439 | setOperationAction(ISD::SELECT, MVT::v64i1, Custom); |
1440 | setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); |
1441 | setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); |
1442 | setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); |
1443 | setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); |
1444 | setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); |
1445 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); |
1446 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); |
1447 | setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); |
1448 | setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); |
1449 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); |
1450 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); |
1451 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); |
1452 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); |
1453 | setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); |
1454 | setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); |
1455 | setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); |
1456 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); |
1457 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); |
1458 | setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom); |
1459 | setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom); |
1460 | setOperationAction(ISD::VSELECT, MVT::v32i1, Expand); |
1461 | setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); |
1462 | setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); |
1463 | |
1464 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); |
1465 | |
1466 | setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); |
1467 | if (Subtarget.hasVLX()) { |
1468 | setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); |
1469 | setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); |
1470 | } |
1471 | |
1472 | LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom; |
1473 | for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { |
1474 | setOperationAction(ISD::MLOAD, VT, Action); |
1475 | setOperationAction(ISD::MSTORE, VT, Action); |
1476 | } |
1477 | |
1478 | if (Subtarget.hasCDI()) { |
1479 | setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); |
1480 | setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); |
1481 | } |
1482 | |
1483 | for (auto VT : { MVT::v64i8, MVT::v32i16 }) { |
1484 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
1485 | setOperationAction(ISD::VSELECT, VT, Custom); |
1486 | setOperationAction(ISD::ABS, VT, Legal); |
1487 | setOperationAction(ISD::SRL, VT, Custom); |
1488 | setOperationAction(ISD::SHL, VT, Custom); |
1489 | setOperationAction(ISD::SRA, VT, Custom); |
1490 | setOperationAction(ISD::MLOAD, VT, Legal); |
1491 | setOperationAction(ISD::MSTORE, VT, Legal); |
1492 | setOperationAction(ISD::CTPOP, VT, Custom); |
1493 | setOperationAction(ISD::CTTZ, VT, Custom); |
1494 | setOperationAction(ISD::SMAX, VT, Legal); |
1495 | setOperationAction(ISD::UMAX, VT, Legal); |
1496 | setOperationAction(ISD::SMIN, VT, Legal); |
1497 | setOperationAction(ISD::UMIN, VT, Legal); |
1498 | |
1499 | setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); |
1500 | setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); |
1501 | setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); |
1502 | } |
1503 | |
1504 | for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { |
1505 | setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); |
1506 | if (Subtarget.hasVLX()) { |
1507 | // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1508 | setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal); |
1509 | setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal); |
1510 | } |
1511 | } |
1512 | } |
1513 | |
1514 | if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { |
1515 | addRegisterClass(MVT::v4i1, &X86::VK4RegClass); |
1516 | addRegisterClass(MVT::v2i1, &X86::VK2RegClass); |
1517 | |
1518 | for (auto VT : { MVT::v2i1, MVT::v4i1 }) { |
1519 | setOperationAction(ISD::ADD, VT, Custom); |
1520 | setOperationAction(ISD::SUB, VT, Custom); |
1521 | setOperationAction(ISD::MUL, VT, Custom); |
1522 | setOperationAction(ISD::VSELECT, VT, Expand); |
1523 | |
1524 | setOperationAction(ISD::TRUNCATE, VT, Custom); |
1525 | setOperationAction(ISD::SETCC, VT, Custom); |
1526 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); |
1527 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); |
1528 | setOperationAction(ISD::SELECT, VT, Custom); |
1529 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); |
1530 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); |
1531 | } |
1532 | |
1533 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); |
1534 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); |
1535 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); |
1536 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); |
1537 | |
1538 | for (auto VT : { MVT::v2i64, MVT::v4i64 }) { |
1539 | setOperationAction(ISD::SMAX, VT, Legal); |
1540 | setOperationAction(ISD::UMAX, VT, Legal); |
1541 | setOperationAction(ISD::SMIN, VT, Legal); |
1542 | setOperationAction(ISD::UMIN, VT, Legal); |
1543 | } |
1544 | } |
1545 | |
1546 | // We want to custom lower some of our intrinsics. |
1547 | setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); |
1548 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); |
1549 | setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); |
1550 | if (!Subtarget.is64Bit()) { |
1551 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); |
1552 | setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); |
1553 | } |
1554 | |
1555 | // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't |
1556 | // handle type legalization for these operations here. |
1557 | // |
1558 | // FIXME: We really should do custom legalization for addition and |
1559 | // subtraction on x86-32 once PR3203 is fixed. We really can't do much better |
1560 | // than generic legalization for 64-bit multiplication-with-overflow, though. |
1561 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { |
1562 | if (VT == MVT::i64 && !Subtarget.is64Bit()) |
1563 | continue; |
1564 | // Add/Sub/Mul with overflow operations are custom lowered. |
1565 | setOperationAction(ISD::SADDO, VT, Custom); |
1566 | setOperationAction(ISD::UADDO, VT, Custom); |
1567 | setOperationAction(ISD::SSUBO, VT, Custom); |
1568 | setOperationAction(ISD::USUBO, VT, Custom); |
1569 | setOperationAction(ISD::SMULO, VT, Custom); |
1570 | setOperationAction(ISD::UMULO, VT, Custom); |
1571 | |
1572 | // Support carry in as value rather than glue. |
1573 | setOperationAction(ISD::ADDCARRY, VT, Custom); |
1574 | setOperationAction(ISD::SUBCARRY, VT, Custom); |
1575 | setOperationAction(ISD::SETCCCARRY, VT, Custom); |
1576 | } |
1577 | |
1578 | if (!Subtarget.is64Bit()) { |
1579 | // These libcalls are not available in 32-bit. |
1580 | setLibcallName(RTLIB::SHL_I128, nullptr); |
1581 | setLibcallName(RTLIB::SRL_I128, nullptr); |
1582 | setLibcallName(RTLIB::SRA_I128, nullptr); |
1583 | } |
1584 | |
1585 | // Combine sin / cos into one node or libcall if possible. |
1586 | if (Subtarget.hasSinCos()) { |
1587 | setLibcallName(RTLIB::SINCOS_F32, "sincosf"); |
1588 | setLibcallName(RTLIB::SINCOS_F64, "sincos"); |
1589 | if (Subtarget.isTargetDarwin()) { |
1590 | // For MacOSX, we don't want the normal expansion of a libcall to sincos. |
1591 | // We want to issue a libcall to __sincos_stret to avoid memory traffic. |
1592 | setOperationAction(ISD::FSINCOS, MVT::f64, Custom); |
1593 | setOperationAction(ISD::FSINCOS, MVT::f32, Custom); |
1594 | } |
1595 | } |
1596 | |
1597 | if (Subtarget.isTargetWin64()) { |
1598 | setOperationAction(ISD::SDIV, MVT::i128, Custom); |
1599 | setOperationAction(ISD::UDIV, MVT::i128, Custom); |
1600 | setOperationAction(ISD::SREM, MVT::i128, Custom); |
1601 | setOperationAction(ISD::UREM, MVT::i128, Custom); |
1602 | setOperationAction(ISD::SDIVREM, MVT::i128, Custom); |
1603 | setOperationAction(ISD::UDIVREM, MVT::i128, Custom); |
1604 | } |
1605 | |
1606 | // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1607 | // is. We should promote the value to 64 bits to solve this.
1608 | // This is what the CRT headers do - `fmodf` is an inline header
1609 | // function that casts to f64 and calls `fmod`.
1610 | if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || |
1611 | Subtarget.isTargetWindowsItanium())) |
1612 | for (ISD::NodeType Op : |
1613 | {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, |
1614 | ISD::FLOG10, ISD::FPOW, ISD::FSIN}) |
1615 | if (isOperationExpand(Op, MVT::f32)) |
1616 | setOperationAction(Op, MVT::f32, Promote); |
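| // Roughly what the CRT header does (sketch, not the verbatim header):
| //   static __inline float fmodf(float x, float y)
| //   { return (float)fmod((double)x, (double)y); }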
1617 | |
1618 | // We have target-specific dag combine patterns for the following nodes: |
1619 | setTargetDAGCombine(ISD::VECTOR_SHUFFLE); |
1620 | setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); |
1621 | setTargetDAGCombine(ISD::INSERT_SUBVECTOR); |
1622 | setTargetDAGCombine(ISD::BITCAST); |
1623 | setTargetDAGCombine(ISD::VSELECT); |
1624 | setTargetDAGCombine(ISD::SELECT); |
1625 | setTargetDAGCombine(ISD::SHL); |
1626 | setTargetDAGCombine(ISD::SRA); |
1627 | setTargetDAGCombine(ISD::SRL); |
1628 | setTargetDAGCombine(ISD::OR); |
1629 | setTargetDAGCombine(ISD::AND); |
1630 | setTargetDAGCombine(ISD::ADD); |
1631 | setTargetDAGCombine(ISD::FADD); |
1632 | setTargetDAGCombine(ISD::FSUB); |
1633 | setTargetDAGCombine(ISD::FNEG); |
1634 | setTargetDAGCombine(ISD::FMA); |
1635 | setTargetDAGCombine(ISD::FMINNUM); |
1636 | setTargetDAGCombine(ISD::FMAXNUM); |
1637 | setTargetDAGCombine(ISD::SUB); |
1638 | setTargetDAGCombine(ISD::LOAD); |
1639 | setTargetDAGCombine(ISD::MLOAD); |
1640 | setTargetDAGCombine(ISD::STORE); |
1641 | setTargetDAGCombine(ISD::MSTORE); |
1642 | setTargetDAGCombine(ISD::TRUNCATE); |
1643 | setTargetDAGCombine(ISD::ZERO_EXTEND); |
1644 | setTargetDAGCombine(ISD::ANY_EXTEND); |
1645 | setTargetDAGCombine(ISD::SIGN_EXTEND); |
1646 | setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); |
1647 | setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); |
1648 | setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); |
1649 | setTargetDAGCombine(ISD::SINT_TO_FP); |
1650 | setTargetDAGCombine(ISD::UINT_TO_FP); |
1651 | setTargetDAGCombine(ISD::SETCC); |
1652 | setTargetDAGCombine(ISD::MUL); |
1653 | setTargetDAGCombine(ISD::XOR); |
1654 | setTargetDAGCombine(ISD::MSCATTER); |
1655 | setTargetDAGCombine(ISD::MGATHER); |
1656 | |
1657 | computeRegisterProperties(Subtarget.getRegisterInfo()); |
1658 | |
1659 | MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores |
1660 | MaxStoresPerMemsetOptSize = 8; |
1661 | MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores |
1662 | MaxStoresPerMemcpyOptSize = 4; |
1663 | MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores |
1664 | MaxStoresPerMemmoveOptSize = 4; |
1665 | |
1666 | // TODO: These control memcmp expansion in CGP and are set low to prevent |
1667 | // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. |
1668 | MaxLoadsPerMemcmp = 1; |
1669 | MaxLoadsPerMemcmpOptSize = 1; |
1670 | |
1671 | // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). |
1672 | setPrefLoopAlignment(ExperimentalPrefLoopAlignment); |
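| // E.g. the default value of 4 aligns loop headers to 2^4 = 16 bytes.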
1673 | |
1674 | // An out-of-order CPU can speculatively execute past a predictable branch, |
1675 | // but a conditional move could be stalled by an expensive earlier operation. |
1676 | PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); |
1677 | EnableExtLdPromotion = true; |
1678 | setPrefFunctionAlignment(4); // 2^4 bytes. |
1679 | |
1680 | verifyIntrinsicTables(); |
1681 | } |
1682 | |
1683 | // This has so far only been implemented for 64-bit MachO. |
1684 | bool X86TargetLowering::useLoadStackGuardNode() const { |
1685 | return Subtarget.isTargetMachO() && Subtarget.is64Bit(); |
1686 | } |
1687 | |
1688 | TargetLoweringBase::LegalizeTypeAction |
1689 | X86TargetLowering::getPreferredVectorAction(EVT VT) const { |
1690 | if (ExperimentalVectorWideningLegalization && |
1691 | VT.getVectorNumElements() != 1 && |
1692 | VT.getVectorElementType().getSimpleVT() != MVT::i1) |
1693 | return TypeWidenVector; |
1694 | |
1695 | return TargetLoweringBase::getPreferredVectorAction(VT); |
1696 | } |
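| // E.g. with widening enabled, v2i32 is widened to v4i32 instead of being
| // promoted element-wise to v2i64.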
1697 | |
1698 | EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, |
1699 | LLVMContext& Context, |
1700 | EVT VT) const { |
1701 | if (!VT.isVector()) |
1702 | return MVT::i8; |
1703 | |
1704 | if (VT.isSimple()) { |
1705 | MVT VVT = VT.getSimpleVT(); |
1706 | const unsigned NumElts = VVT.getVectorNumElements(); |
1707 | MVT EltVT = VVT.getVectorElementType(); |
1708 | if (VVT.is512BitVector()) { |
1709 | if (Subtarget.hasAVX512()) |
1710 | if (EltVT == MVT::i32 || EltVT == MVT::i64 || |
1711 | EltVT == MVT::f32 || EltVT == MVT::f64) |
1712 | switch(NumElts) { |
1713 | case 8: return MVT::v8i1; |
1714 | case 16: return MVT::v16i1; |
1715 | } |
1716 | if (Subtarget.hasBWI()) |
1717 | if (EltVT == MVT::i8 || EltVT == MVT::i16) |
1718 | switch(NumElts) { |
1719 | case 32: return MVT::v32i1; |
1720 | case 64: return MVT::v64i1; |
1721 | } |
1722 | } |
1723 | |
1724 | if (Subtarget.hasBWI() && Subtarget.hasVLX()) |
1725 | return MVT::getVectorVT(MVT::i1, NumElts); |
1726 | |
1727 | if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) { |
1728 | EVT LegalVT = getTypeToTransformTo(Context, VT); |
1729 | EltVT = LegalVT.getVectorElementType().getSimpleVT(); |
1730 | } |
1731 | |
1732 | if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32) |
1733 | switch(NumElts) { |
1734 | case 2: return MVT::v2i1; |
1735 | case 4: return MVT::v4i1; |
1736 | case 8: return MVT::v8i1; |
1737 | } |
1738 | } |
1739 | |
1740 | return VT.changeVectorElementTypeToInteger(); |
1741 | } |
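| // E.g. a v16f32 compare yields v16i1 under AVX512, while a v4i32 compare on
| // a subtarget without VLX falls through to a v4i32 result.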
1742 | |
1743 | /// Helper for getByValTypeAlignment to determine |
1744 | /// the desired ByVal argument alignment. |
1745 | static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { |
1746 | if (MaxAlign == 16) |
1747 | return; |
1748 | if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { |
1749 | if (VTy->getBitWidth() == 128) |
1750 | MaxAlign = 16; |
1751 | } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { |
1752 | unsigned EltAlign = 0; |
1753 | getMaxByValAlign(ATy->getElementType(), EltAlign); |
1754 | if (EltAlign > MaxAlign) |
1755 | MaxAlign = EltAlign; |
1756 | } else if (StructType *STy = dyn_cast<StructType>(Ty)) { |
1757 | for (auto *EltTy : STy->elements()) { |
1758 | unsigned EltAlign = 0; |
1759 | getMaxByValAlign(EltTy, EltAlign); |
1760 | if (EltAlign > MaxAlign) |
1761 | MaxAlign = EltAlign; |
1762 | if (MaxAlign == 16) |
1763 | break; |
1764 | } |
1765 | } |
1766 | } |
1767 | |
1768 | /// Return the desired alignment for ByVal aggregate |
1769 | /// function arguments in the caller parameter area. For X86, aggregates |
1770 | /// that contain SSE vectors are placed at 16-byte boundaries while the rest |
1771 | /// are at 4-byte boundaries. |
1772 | unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, |
1773 | const DataLayout &DL) const { |
1774 | if (Subtarget.is64Bit()) { |
1775 | // Max of 8 and alignment of type. |
1776 | unsigned TyAlign = DL.getABITypeAlignment(Ty); |
1777 | if (TyAlign > 8) |
1778 | return TyAlign; |
1779 | return 8; |
1780 | } |
1781 | |
1782 | unsigned Align = 4; |
1783 | if (Subtarget.hasSSE1()) |
1784 | getMaxByValAlign(Ty, Align); |
1785 | return Align; |
1786 | } |
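| // E.g. on 32-bit x86 with SSE1, a byval struct containing a <4 x float>
| // member is placed at a 16-byte boundary; one holding only scalars stays at 4.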
1787 | |
1788 | /// Returns the target specific optimal type for load |
1789 | /// and store operations as a result of memset, memcpy, and memmove |
1790 | /// lowering. If DstAlign is zero that means it's safe to destination |
1791 | /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it |
1792 | /// means there isn't a need to check it against alignment requirement, |
1793 | /// probably because the source does not need to be loaded. If 'IsMemset' is |
1794 | /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that |
1795 | /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy |
1796 | /// source is constant so it does not need to be loaded. |
1797 | /// It returns EVT::Other if the type should be determined using generic |
1798 | /// target-independent logic. |
1799 | EVT |
1800 | X86TargetLowering::getOptimalMemOpType(uint64_t Size, |
1801 | unsigned DstAlign, unsigned SrcAlign, |
1802 | bool IsMemset, bool ZeroMemset, |
1803 | bool MemcpyStrSrc, |
1804 | MachineFunction &MF) const { |
1805 | const Function *F = MF.getFunction(); |
1806 | if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
1807 | if (Size >= 16 && |
1808 | (!Subtarget.isUnalignedMem16Slow() || |
1809 | ((DstAlign == 0 || DstAlign >= 16) && |
1810 | (SrcAlign == 0 || SrcAlign >= 16)))) { |
1811 | // FIXME: Check if unaligned 32-byte accesses are slow. |
1812 | if (Size >= 32 && Subtarget.hasAVX()) { |
1813 | // Although this isn't a well-supported type for AVX1, we'll let |
1814 | // legalization and shuffle lowering produce the optimal codegen. If we |
1815 | // choose an optimal type with a vector element larger than a byte, |
1816 | // getMemsetStores() may create an intermediate splat (using an integer |
1817 | // multiply) before we splat as a vector. |
1818 | return MVT::v32i8; |
1819 | } |
1820 | if (Subtarget.hasSSE2()) |
1821 | return MVT::v16i8; |
1822 | // TODO: Can SSE1 handle a byte vector? |
1823 | if (Subtarget.hasSSE1()) |
1824 | return MVT::v4f32; |
1825 | } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && |
1826 | !Subtarget.is64Bit() && Subtarget.hasSSE2()) { |
1827 | // Do not use f64 to lower memcpy if source is string constant. It's |
1828 | // better to use i32 to avoid the loads. |
1829 | // Also, do not use f64 to lower memset unless this is a memset of zeros. |
1830 | // The gymnastics of splatting a byte value into an XMM register and then |
1831 | // only using 8-byte stores (because this is a CPU with slow unaligned |
1832 | // 16-byte accesses) makes that a loser. |
1833 | return MVT::f64; |
1834 | } |
1835 | } |
1836 | // This is a compromise. If we reach here, unaligned accesses may be slow on |
1837 | // this target. However, creating smaller, aligned accesses could be even |
1838 | // slower and would certainly be a lot more code. |
1839 | if (Subtarget.is64Bit() && Size >= 8) |
1840 | return MVT::i64; |
1841 | return MVT::i32; |
1842 | } |
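| // E.g. a 32-byte memset on an AVX CPU with fast unaligned accesses becomes a
| // single v32i8 store, while on i386 without SSE it falls back to i32 stores.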
1843 | |
1844 | bool X86TargetLowering::isSafeMemOpType(MVT VT) const { |
1845 | if (VT == MVT::f32) |
1846 | return X86ScalarSSEf32; |
1847 | else if (VT == MVT::f64) |
1848 | return X86ScalarSSEf64; |
1849 | return true; |
1850 | } |
1851 | |
1852 | bool |
1853 | X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, |
1854 | unsigned, |
1855 | unsigned, |
1856 | bool *Fast) const { |
1857 | if (Fast) { |
1858 | switch (VT.getSizeInBits()) { |
1859 | default: |
1860 | // 8-byte and under are always assumed to be fast. |
1861 | *Fast = true; |
1862 | break; |
1863 | case 128: |
1864 | *Fast = !Subtarget.isUnalignedMem16Slow(); |
1865 | break; |
1866 | case 256: |
1867 | *Fast = !Subtarget.isUnalignedMem32Slow(); |
1868 | break; |
1869 | // TODO: What about AVX-512 (512-bit) accesses? |
1870 | } |
1871 | } |
1872 | // Misaligned accesses of any size are always allowed. |
1873 | return true; |
1874 | } |
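| // E.g. a misaligned 32-byte access is still allowed, but *Fast is cleared on
| // subtargets where isUnalignedMem32Slow() holds, nudging callers toward
| // smaller accesses.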
1875 | |
1876 | /// Return the entry encoding for a jump table in the |
1877 | /// current function. The returned value is a member of the |
1878 | /// MachineJumpTableInfo::JTEntryKind enum. |
1879 | unsigned X86TargetLowering::getJumpTableEncoding() const { |
1880 | // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF |
1881 | // symbol. |
1882 | if (isPositionIndependent() && Subtarget.isPICStyleGOT()) |
1883 | return MachineJumpTableInfo::EK_Custom32; |
1884 | |
1885 | // Otherwise, use the normal jump table encoding heuristics. |
1886 | return TargetLowering::getJumpTableEncoding(); |
1887 | } |
1888 | |
1889 | bool X86TargetLowering::useSoftFloat() const { |
1890 | return Subtarget.useSoftFloat(); |
1891 | } |
1892 | |
1893 | void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, |
1894 | ArgListTy &Args) const { |
1895 | |
1896 | // Only relabel X86-32 for C / Stdcall CCs. |
1897 | if (Subtarget.is64Bit()) |
1898 | return; |
1899 | if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) |
1900 | return; |
1901 | unsigned ParamRegs = 0; |
1902 | if (auto *M = MF->getFunction()->getParent()) |
1903 | ParamRegs = M->getNumberRegisterParameters(); |
1904 | |
1905 | // Mark the first N integer arguments as being passed in registers.
1906 | for (unsigned Idx = 0; Idx < Args.size(); Idx++) { |
1907 | Type *T = Args[Idx].Ty; |
1908 | if (T->isPointerTy() || T->isIntegerTy()) |
1909 | if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { |
1910 | unsigned numRegs = 1; |
1911 | if (MF->getDataLayout().getTypeAllocSize(T) > 4) |
1912 | numRegs = 2; |
1913 | if (ParamRegs < numRegs) |
1914 | return; |
1915 | ParamRegs -= numRegs; |
1916 | Args[Idx].IsInReg = true; |
1917 | } |
1918 | } |
1919 | } |
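| // E.g. if the module was built with -mregparm=2, the first i32 argument of a
| // C libcall is marked IsInReg; an i64 argument needs two of the remaining
| // parameter registers or is left on the stack (sketch of the loop's intent).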
1920 | |
1921 | const MCExpr * |
1922 | X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, |
1923 | const MachineBasicBlock *MBB, |
1924 | unsigned uid, MCContext &Ctx) const {
1925 | assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1926 | // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF |
1927 | // entries. |
1928 | return MCSymbolRefExpr::create(MBB->getSymbol(), |
1929 | MCSymbolRefExpr::VK_GOTOFF, Ctx); |
1930 | } |
1931 | |
1932 | /// Returns relocation base for the given PIC jumptable. |
1933 | SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, |
1934 | SelectionDAG &DAG) const { |
1935 | if (!Subtarget.is64Bit()) |
1936 | // This doesn't have SDLoc associated with it, but is not really the |
1937 | // same as a Register. |
1938 | return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), |
1939 | getPointerTy(DAG.getDataLayout())); |
1940 | return Table; |
1941 | } |
1942 | |
1943 | /// This returns the relocation base for the given PIC jumptable, |
1944 | /// the same as getPICJumpTableRelocBase, but as an MCExpr. |
1945 | const MCExpr *X86TargetLowering:: |
1946 | getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, |
1947 | MCContext &Ctx) const { |
1948 | // X86-64 uses RIP relative addressing based on the jump table label. |
1949 | if (Subtarget.isPICStyleRIPRel()) |
1950 | return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); |
1951 | |
1952 | // Otherwise, the reference is relative to the PIC base. |
1953 | return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); |
1954 | } |
1955 | |
1956 | std::pair<const TargetRegisterClass *, uint8_t> |
1957 | X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, |
1958 | MVT VT) const { |
1959 | const TargetRegisterClass *RRC = nullptr; |
1960 | uint8_t Cost = 1; |
1961 | switch (VT.SimpleTy) { |
1962 | default: |
1963 | return TargetLowering::findRepresentativeClass(TRI, VT); |
1964 | case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: |
1965 | RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; |
1966 | break; |
1967 | case MVT::x86mmx: |
1968 | RRC = &X86::VR64RegClass; |
1969 | break; |
1970 | case MVT::f32: case MVT::f64: |
1971 | case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: |
1972 | case MVT::v4f32: case MVT::v2f64: |
1973 | case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: |
1974 | case MVT::v8f32: case MVT::v4f64: |
1975 | case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: |
1976 | case MVT::v16f32: case MVT::v8f64: |
1977 | RRC = &X86::VR128XRegClass; |
1978 | break; |
1979 | } |
1980 | return std::make_pair(RRC, Cost); |
1981 | } |
1982 | |
1983 | unsigned X86TargetLowering::getAddressSpace() const { |
1984 | if (Subtarget.is64Bit()) |
1985 | return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; |
1986 | return 256; |
1987 | } |
1988 | |
1989 | static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { |
1990 | return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || |
1991 | (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); |
1992 | } |
1993 | |
1994 | static Constant* SegmentOffset(IRBuilder<> &IRB, |
1995 | unsigned Offset, unsigned AddressSpace) { |
1996 | return ConstantExpr::getIntToPtr( |
1997 | ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), |
1998 | Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); |
1999 | } |
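| // This builds an inttoptr constant into an x86 segment address space
| // (256 = %gs, 257 = %fs), e.g. roughly:
| //   inttoptr (i32 40 to i8* addrspace(257)*)   ; %fs:0x28 on x86-64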
2000 | |
2001 | Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { |
2002 | // glibc, bionic, and Fuchsia have a special slot for the stack guard in |
2003 | // tcbhead_t; use it instead of the usual global variable (see |
2004 | // sysdeps/{i386,x86_64}/nptl/tls.h) |
2005 | if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { |
2006 | if (Subtarget.isTargetFuchsia()) { |
2007 | // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. |
2008 | return SegmentOffset(IRB, 0x10, getAddressSpace()); |
2009 | } else { |
2010 | // %fs:0x28, unless we're using a Kernel code model, in which case |
2011 | // it's %gs:0x28. %gs:0x14 on i386.
2012 | unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; |
2013 | return SegmentOffset(IRB, Offset, getAddressSpace()); |
2014 | } |
2015 | } |
2016 | |
2017 | return TargetLowering::getIRStackGuard(IRB); |
2018 | } |
2019 | |
2020 | void X86TargetLowering::insertSSPDeclarations(Module &M) const { |
2021 | // The MSVC CRT provides functionality for stack protection.
2022 | if (Subtarget.getTargetTriple().isOSMSVCRT()) { |
2023 | // MSVC CRT has a global variable holding security cookie. |
2024 | M.getOrInsertGlobal("__security_cookie", |
2025 | Type::getInt8PtrTy(M.getContext())); |
2026 | |
2027 | // MSVC CRT has a function to validate security cookie. |
2028 | auto *SecurityCheckCookie = cast<Function>( |
2029 | M.getOrInsertFunction("__security_check_cookie", |
2030 | Type::getVoidTy(M.getContext()), |
2031 | Type::getInt8PtrTy(M.getContext()))); |
2032 | SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); |
2033 | SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); |
2034 | return; |
2035 | } |
2036 | // glibc, bionic, and Fuchsia have a special slot for the stack guard. |
2037 | if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) |
2038 | return; |
2039 | TargetLowering::insertSSPDeclarations(M); |
2040 | } |
2041 | |
2042 | Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { |
2043 | // MSVC CRT has a global variable holding security cookie. |
2044 | if (Subtarget.getTargetTriple().isOSMSVCRT()) |
2045 | return M.getGlobalVariable("__security_cookie"); |
2046 | return TargetLowering::getSDagStackGuard(M); |
2047 | } |
2048 | |
2049 | Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { |
2050 | // MSVC CRT has a function to validate security cookie. |
2051 | if (Subtarget.getTargetTriple().isOSMSVCRT()) |
2052 | return M.getFunction("__security_check_cookie"); |
2053 | return TargetLowering::getSSPStackGuardCheck(M); |
2054 | } |
2055 | |
2056 | Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { |
2057 | if (Subtarget.getTargetTriple().isOSContiki()) |
2058 | return getDefaultSafeStackPointerLocation(IRB, false); |
2059 | |
2060 | // Android provides a fixed TLS slot for the SafeStack pointer. See the |
2061 | // definition of TLS_SLOT_SAFESTACK in |
2062 | // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h |
2063 | if (Subtarget.isTargetAndroid()) { |
2064 | // %fs:0x48, unless we're using a Kernel code model, in which case it's
2065 | // %gs:0x48. %gs:0x24 on i386.
2066 | unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; |
2067 | return SegmentOffset(IRB, Offset, getAddressSpace()); |
2068 | } |
2069 | |
2070 | // Fuchsia is similar. |
2071 | if (Subtarget.isTargetFuchsia()) { |
2072 | // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. |
2073 | return SegmentOffset(IRB, 0x18, getAddressSpace()); |
2074 | } |
2075 | |
2076 | return TargetLowering::getSafeStackPointerLocation(IRB); |
2077 | } |
2078 | |
2079 | bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, |
2080 | unsigned DestAS) const { |
2081 | assert(SrcAS != DestAS && "Expected different address spaces!");
2082 | |
2083 | return SrcAS < 256 && DestAS < 256; |
2084 | } |
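| // I.e. a cast is a no-op only while neither side is a segment address space
| // (%gs/%fs at 256/257; see getAddressSpace above).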
2085 | |
2086 | //===----------------------------------------------------------------------===// |
2087 | // Return Value Calling Convention Implementation |
2088 | //===----------------------------------------------------------------------===// |
2089 | |
2090 | #include "X86GenCallingConv.inc" |
2091 | |
2092 | bool X86TargetLowering::CanLowerReturn( |
2093 | CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, |
2094 | const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { |
2095 | SmallVector<CCValAssign, 16> RVLocs; |
2096 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); |
2097 | return CCInfo.CheckReturn(Outs, RetCC_X86); |
2098 | } |
2099 | |
2100 | const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { |
2101 | static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; |
2102 | return ScratchRegs; |
2103 | } |
2104 | |
2105 | /// Lowers mask values (v*i1) to the local register values.
2106 | /// \returns DAG node after lowering to register type |
2107 | static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, |
2108 | const SDLoc &Dl, SelectionDAG &DAG) { |
2109 | EVT ValVT = ValArg.getValueType(); |
2110 | |
2111 | if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || |
2112 | (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { |
2113 | // Two stage lowering might be required |
2114 | // bitcast: v8i1 -> i8 / v16i1 -> i16 |
2115 | // anyextend: i8 -> i32 / i16 -> i32 |
2116 | EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; |
2117 | SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); |
2118 | if (ValLoc == MVT::i32) |
2119 | ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); |
2120 | return ValToCopy; |
2121 | } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || |
2122 | (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { |
2123 | // One stage lowering is required |
2124 | // bitcast: v32i1 -> i32 / v64i1 -> i64 |
2125 | return DAG.getBitcast(ValLoc, ValArg); |
2126 | } else |
2127 | return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg); |
2128 | } |
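| // E.g. a v16i1 value headed for an i32 location becomes
| //   (any_extend i32 (bitcast i16 ValArg))
| // while v32i1 headed for i32 is a single bitcast.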
2129 | |
2130 | /// Breaks v64i1 value into two registers and adds the new node to the DAG |
2131 | static void Passv64i1ArgInRegs( |
2132 | const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, |
2133 | SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA, |
2134 | CCValAssign &NextVA, const X86Subtarget &Subtarget) { |
2135 | assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2136 |        "Expected AVX512BW or AVX512BMI target!");
2137 | assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2138 | assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2139 | assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2140 |        "The value should reside in two registers");
2141 | |
2142 | // Before splitting the value we cast it to i64 |
2143 | Arg = DAG.getBitcast(MVT::i64, Arg); |
2144 | |
2145 | // Splitting the value into two i32 types |
2146 | SDValue Lo, Hi; |
2147 | Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, |
2148 | DAG.getConstant(0, Dl, MVT::i32)); |
2149 | Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, |
2150 | DAG.getConstant(1, Dl, MVT::i32)); |
2151 | |
2152 | // Put the two i32 halves into their corresponding registers.
2153 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); |
2154 | RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); |
2155 | } |
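| // E.g. a v64i1 argument on a 32-bit target is bitcast to i64 and its two
| // i32 halves go to whichever register pair the calling convention assigned
| // to VA and NextVA.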
2156 | |
2157 | SDValue |
2158 | X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
2159 | bool isVarArg, |
2160 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
2161 | const SmallVectorImpl<SDValue> &OutVals, |
2162 | const SDLoc &dl, SelectionDAG &DAG) const { |
2163 | MachineFunction &MF = DAG.getMachineFunction(); |
2164 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
2165 | |
2166 | // In some cases we need to disable registers from the default CSR list. |
2167 | // For example, when they are used for argument passing. |
2168 | bool ShouldDisableCalleeSavedRegister = |
2169 | CallConv == CallingConv::X86_RegCall || |
2170 | MF.getFunction()->hasFnAttribute("no_caller_saved_registers"); |
2171 | |
2172 | if (CallConv == CallingConv::X86_INTR && !Outs.empty()) |
2173 | report_fatal_error("X86 interrupts may not return any value"); |
2174 | |
2175 | SmallVector<CCValAssign, 16> RVLocs; |
2176 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); |
2177 | CCInfo.AnalyzeReturn(Outs, RetCC_X86); |
2178 | |
2179 | SDValue Flag; |
2180 | SmallVector<SDValue, 6> RetOps; |
2181 | RetOps.push_back(Chain); // Operand #0 = Chain (updated below) |
2182 | // Operand #1 = Bytes To Pop |
2183 | RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, |
2184 | MVT::i32)); |
2185 | |
2186 | // Copy the result values into the output registers. |
2187 | for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; |
2188 | ++I, ++OutsIndex) { |
2189 | CCValAssign &VA = RVLocs[I]; |
2190 | assert(VA.isRegLoc() && "Can only return in registers!");
2191 | |
2192 | // Add the register to the CalleeSaveDisableRegs list. |
2193 | if (ShouldDisableCalleeSavedRegister) |
2194 | MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); |
2195 | |
2196 | SDValue ValToCopy = OutVals[OutsIndex]; |
2197 | EVT ValVT = ValToCopy.getValueType(); |
2198 | |
2199 | // Promote values to the appropriate types. |
2200 | if (VA.getLocInfo() == CCValAssign::SExt) |
2201 | ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); |
2202 | else if (VA.getLocInfo() == CCValAssign::ZExt) |
2203 | ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); |
2204 | else if (VA.getLocInfo() == CCValAssign::AExt) { |
2205 | if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) |
2206 | ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); |
2207 | else |
2208 | ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); |
2209 | } |
2210 | else if (VA.getLocInfo() == CCValAssign::BCvt) |
2211 | ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); |
2212 | |
2213 | assert(VA.getLocInfo() != CCValAssign::FPExt &&
2214 |        "Unexpected FP-extend for return value.");
2215 | |
2216 | // If this is x86-64, and we disabled SSE, we can't return FP values, |
2217 | // or SSE or MMX vectors. |
2218 | if ((ValVT == MVT::f32 || ValVT == MVT::f64 || |
2219 | VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && |
2220 | (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { |
2221 | errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); |
2222 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. |
2223 | } else if (ValVT == MVT::f64 && |
2224 | (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { |
2225 | // Likewise we can't return F64 values with SSE1 only. gcc does so, but |
2226 | // llvm-gcc has never done it right and no one has noticed, so this |
2227 | // should be OK for now. |
2228 | errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); |
2229 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. |
2230 | } |
2231 | |
2232 | // Returns in ST0/ST1 are handled specially: these are pushed as operands to |
2233 | // the RET instruction and handled by the FP Stackifier. |
2234 | if (VA.getLocReg() == X86::FP0 || |
2235 | VA.getLocReg() == X86::FP1) { |
2236 | // If this is a copy from an xmm register to ST(0), use an FPExtend to |
2237 | // change the value to the FP stack register class. |
2238 | if (isScalarFPTypeInSSEReg(VA.getValVT())) |
2239 | ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); |
2240 | RetOps.push_back(ValToCopy); |
2241 | // Don't emit a copytoreg. |
2242 | continue; |
2243 | } |
2244 | |
2245 | // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 |
2246 | // which is returned in RAX / RDX. |
2247 | if (Subtarget.is64Bit()) { |
2248 | if (ValVT == MVT::x86mmx) { |
2249 | if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { |
2250 | ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); |
2251 | ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, |
2252 | ValToCopy); |
2253 | // If we don't have SSE2 available, convert to v4f32 so the generated |
2254 | // register is legal. |
2255 | if (!Subtarget.hasSSE2()) |
2256 | ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); |
2257 | } |
2258 | } |
2259 | } |
2260 | |
2261 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
2262 | |
2263 | if (VA.needsCustom()) { |
2264 | assert(VA.getValVT() == MVT::v64i1 &&
2265 |        "Currently the only custom case is when we split v64i1 to 2 regs");
2266 | |
2267 | Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], |
2268 | Subtarget); |
2269 | |
2270 | assert(2 == RegsToPass.size() &&
2271 |        "Expecting two registers after Pass64BitArgInRegs");
2272 | |
2273 | // Add the second register to the CalleeSaveDisableRegs list. |
2274 | if (ShouldDisableCalleeSavedRegister) |
2275 | MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); |
2276 | } else { |
2277 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); |
2278 | } |
2279 | |
2280 | // Add nodes to the DAG and add the values into the RetOps list |
2281 | for (auto &Reg : RegsToPass) { |
2282 | Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); |
2283 | Flag = Chain.getValue(1); |
2284 | RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); |
2285 | } |
2286 | } |
2287 | |
2288 | // The Swift calling convention does not require that we copy the sret
2289 | // argument into %rax/%eax for the return; SRetReturnReg is not set for Swift.
2290 | |
2291 | // All x86 ABIs require that for returning structs by value we copy |
2292 | // the sret argument into %rax/%eax (depending on ABI) for the return. |
2293 | // We saved the argument into a virtual register in the entry block, |
2294 | // so now we copy the value out and into %rax/%eax. |
2295 | // |
2296 | // Checking Function.hasStructRetAttr() here is insufficient because the IR |
2297 | // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is |
2298 | // false, then an sret argument may be implicitly inserted in the SelDAG. In |
2299 | // either case FuncInfo->setSRetReturnReg() will have been called. |
2300 | if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { |
2301 | // When we have both sret and another return value, we should use the |
2302 | // original Chain stored in RetOps[0], instead of the current Chain updated |
2303 | // in the above loop. If we only have sret, RetOps[0] equals Chain.
2304 | |
2305 | // For the case of sret and another return value, we have |
2306 | // Chain_0 at the function entry |
2307 | // Chain_1 = getCopyToReg(Chain_0) in the above loop |
2308 | // If we use Chain_1 in getCopyFromReg, we will have |
2309 | // Val = getCopyFromReg(Chain_1) |
2310 | // Chain_2 = getCopyToReg(Chain_1, Val) from below |
2311 | |
2312 | // getCopyToReg(Chain_0) will be glued together with |
2313 | // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be |
2314 | // in Unit B, and we will have cyclic dependency between Unit A and Unit B: |
2315 | // Data dependency from Unit B to Unit A due to usage of Val in |
2316 | // getCopyToReg(Chain_1, Val) |
2317 | // Chain dependency from Unit A to Unit B |
2318 | |
2319 | // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2320 | SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, |
2321 | getPointerTy(MF.getDataLayout())); |
2322 | |
2323 | unsigned RetValReg |
2324 | = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? |
2325 | X86::RAX : X86::EAX; |
2326 | Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); |
2327 | Flag = Chain.getValue(1); |
2328 | |
2329 | // RAX/EAX now acts like a return value. |
2330 | RetOps.push_back( |
2331 | DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); |
2332 | |
2333 | // Add the returned register to the CalleeSaveDisableRegs list. |
2334 | if (ShouldDisableCalleeSavedRegister) |
2335 | MF.getRegInfo().disableCalleeSavedRegister(RetValReg); |
2336 | } |
2337 | |
2338 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
2339 | const MCPhysReg *I = |
2340 | TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); |
2341 | if (I) { |
2342 | for (; *I; ++I) { |
2343 | if (X86::GR64RegClass.contains(*I)) |
2344 | RetOps.push_back(DAG.getRegister(*I, MVT::i64)); |
2345 | else |
2346 | llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2347 | } |
2348 | } |
2349 | |
2350 | RetOps[0] = Chain; // Update chain. |
2351 | |
2352 | // Add the flag if we have it. |
2353 | if (Flag.getNode()) |
2354 | RetOps.push_back(Flag); |
2355 | |
2356 | X86ISD::NodeType opcode = X86ISD::RET_FLAG; |
2357 | if (CallConv == CallingConv::X86_INTR) |
2358 | opcode = X86ISD::IRET; |
2359 | return DAG.getNode(opcode, dl, MVT::Other, RetOps); |
2360 | } |
2361 | |
2362 | bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { |
2363 | if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) |
2364 | return false; |
2365 | |
2366 | SDValue TCChain = Chain; |
2367 | SDNode *Copy = *N->use_begin(); |
2368 | if (Copy->getOpcode() == ISD::CopyToReg) { |
2369 | // If the copy has a glue operand, we conservatively assume it isn't safe to |
2370 | // perform a tail call. |
2371 | if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) |
2372 | return false; |
2373 | TCChain = Copy->getOperand(0); |
2374 | } else if (Copy->getOpcode() != ISD::FP_EXTEND) |
2375 | return false; |
2376 | |
2377 | bool HasRet = false; |
2378 | for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); |
2379 | UI != UE; ++UI) { |
2380 | if (UI->getOpcode() != X86ISD::RET_FLAG) |
2381 | return false; |
2382 | // If we are returning more than one value, we can definitely
2383 | // not make a tail call; see PR19530.
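| // (A single-value return node is RET_FLAG(chain, bytes-to-pop, reg, glue),
| // i.e. four operands, so more than four implies extra return values.)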
2384 | if (UI->getNumOperands() > 4) |
2385 | return false; |
2386 | if (UI->getNumOperands() == 4 && |
2387 | UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) |
2388 | return false; |
2389 | HasRet = true; |
2390 | } |
2391 | |
2392 | if (!HasRet) |
2393 | return false; |
2394 | |
2395 | Chain = TCChain; |
2396 | return true; |
2397 | } |
2398 | |
2399 | EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, |
2400 | ISD::NodeType ExtendKind) const { |
2401 | MVT ReturnMVT = MVT::i32; |
2402 | |
2403 | bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); |
2404 | if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { |
2405 | // The ABI does not require i1, i8 or i16 to be extended. |
2406 | // |
2407 | // On Darwin, there is code in the wild relying on Clang's old behaviour of |
2408 | // always extending i8/i16 return values, so keep doing that for now. |
2409 | // (PR26665). |
2410 | ReturnMVT = MVT::i8; |
2411 | } |
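| // Thus e.g. an i8 return stays i8 on non-Darwin targets but is widened to
| // i32 on Darwin; i1 is always returned as at least i8.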
2412 | |
2413 | EVT MinVT = getRegisterType(Context, ReturnMVT); |
2414 | return VT.bitsLT(MinVT) ? MinVT : VT; |
2415 | } |
2416 | |
2417 | /// Reads two 32 bit registers and creates a 64 bit mask value. |
2418 | /// \param VA The current 32 bit value that needs to be assigned.
2419 | /// \param NextVA The next 32 bit value that needs to be assigned.
2420 | /// \param Root The parent DAG node.
2421 | /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2422 | /// for glue purposes. If the DAG is already using a
2423 | /// physical register instead of a virtual one, we should
2424 | /// glue our new SDValue to the InFlag SDValue.
2425 | /// \return A new 64 bit wide SDValue.
2426 | static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, |
2427 | SDValue &Root, SelectionDAG &DAG, |
2428 | const SDLoc &Dl, const X86Subtarget &Subtarget, |
2429 | SDValue *InFlag = nullptr) { |
2430 | assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2431 | assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2432 | assert(VA.getValVT() == MVT::v64i1 &&
2433 | "Expecting first location of 64 bit width type");
2434 | assert(NextVA.getValVT() == VA.getValVT() &&
2435 | "The locations should have the same type");
2436 | assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2437 | "The values should reside in two registers");
2438 | |
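| // In effect this builds (a sketch of the DAG constructed below):
| //   Lo  = bitcast i32 <low reg>  to v32i1
| //   Hi  = bitcast i32 <high reg> to v32i1
| //   Res = concat_vectors v64i1 Lo, Hi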
2439 | SDValue Lo, Hi; |
2440 | unsigned Reg; |
2441 | SDValue ArgValueLo, ArgValueHi; |
2442 | |
2443 | MachineFunction &MF = DAG.getMachineFunction(); |
2444 | const TargetRegisterClass *RC = &X86::GR32RegClass; |
2445 | |
2446 | // Read the two 32 bit values from the registers.
2447 | if (nullptr == InFlag) { |
2448 | // When no physical register is present, |
2449 | // create an intermediate virtual register |
2450 | Reg = MF.addLiveIn(VA.getLocReg(), RC); |
2451 | ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); |
2452 | Reg = MF.addLiveIn(NextVA.getLocReg(), RC); |
2453 | ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); |
2454 | } else { |
2455 | // When a physical register is available read the value from it and glue |
2456 | // the reads together. |
2457 | ArgValueLo = |
2458 | DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); |
2459 | *InFlag = ArgValueLo.getValue(2); |
2460 | ArgValueHi = |
2461 | DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); |
2462 | *InFlag = ArgValueHi.getValue(2); |
2463 | } |
2464 | |
2465 | // Convert the low i32 value into a v32i1 mask
2466 | Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); |
2467 | |
2468 | // Convert the high i32 value into a v32i1 mask
2469 | Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); |
2470 | |
2471 | // Concatenate the two values together |
2472 | return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); |
2473 | } |
2474 | |
2475 | /// The function will lower a register of various sizes (8/16/32/64 bits)
2476 | /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2477 | /// \returns A DAG node containing the operand after lowering to mask type.
2478 | static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, |
2479 | const EVT &ValLoc, const SDLoc &Dl, |
2480 | SelectionDAG &DAG) { |
2481 | SDValue ValReturned = ValArg; |
2482 | |
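| // E.g. a v16i1 value that arrived in a 32 bit location is lowered as
| //   t = truncate i32 <val> to i16;  bitcast i16 t to v16i1
| // while a v64i1 value in an i64 location only needs the final bitcast.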
2483 | if (ValVT == MVT::v1i1) |
2484 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); |
2485 | |
2486 | if (ValVT == MVT::v64i1) { |
2487 | // On a 32 bit machine, this case is handled by getv64i1Argument.
2488 | assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2489 | // On a 64 bit machine, there is no need to truncate the value, only bitcast.
2490 | } else { |
2491 | MVT maskLen; |
2492 | switch (ValVT.getSimpleVT().SimpleTy) { |
2493 | case MVT::v8i1: |
2494 | maskLen = MVT::i8; |
2495 | break; |
2496 | case MVT::v16i1: |
2497 | maskLen = MVT::i16; |
2498 | break; |
2499 | case MVT::v32i1: |
2500 | maskLen = MVT::i32; |
2501 | break; |
2502 | default: |
2503 | llvm_unreachable("Expecting a vector of i1 types");
2504 | } |
2505 | |
2506 | ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); |
2507 | } |
2508 | return DAG.getBitcast(ValVT, ValReturned); |
2509 | } |
2510 | |
2511 | /// Lower the result values of a call into the |
2512 | /// appropriate copies out of the corresponding physical registers.
2513 | /// |
2514 | SDValue X86TargetLowering::LowerCallResult( |
2515 | SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, |
2516 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
2517 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, |
2518 | uint32_t *RegMask) const { |
2519 | |
2520 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
2521 | // Assign locations to each value returned by this call. |
2522 | SmallVector<CCValAssign, 16> RVLocs; |
2523 | bool Is64Bit = Subtarget.is64Bit(); |
2524 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, |
2525 | *DAG.getContext()); |
2526 | CCInfo.AnalyzeCallResult(Ins, RetCC_X86); |
2527 | |
2528 | // Copy all of the result registers out of their specified physreg. |
2529 | for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; |
2530 | ++I, ++InsIndex) { |
2531 | CCValAssign &VA = RVLocs[I]; |
2532 | EVT CopyVT = VA.getLocVT(); |
2533 | |
2534 | // In some calling conventions we need to remove the used registers |
2535 | // from the register mask. |
2536 | if (RegMask) { |
2537 | for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); |
2538 | SubRegs.isValid(); ++SubRegs) |
2539 | RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); |
2540 | } |
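| // Each register occupies one bit of the mask, at word RegMask[Reg / 32],
| // bit (Reg % 32); iterating sub-registers with IncludeSelf also clears
| // e.g. AX/AH/AL when EAX is cleared.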
2541 | |
2542 | // If this is x86-64, and we disabled SSE, we can't return FP values |
2543 | if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && |
2544 | ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { |
2545 | errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); |
2546 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. |
2547 | } |
2548 | |
2549 | // If we prefer to use the value in xmm registers, copy it out as f80 and |
2550 | // use a truncate to move it from fp stack reg to xmm reg. |
2551 | bool RoundAfterCopy = false; |
2552 | if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && |
2553 | isScalarFPTypeInSSEReg(VA.getValVT())) { |
2554 | if (!Subtarget.hasX87()) |
2555 | report_fatal_error("X87 register return with X87 disabled"); |
2556 | CopyVT = MVT::f80; |
2557 | RoundAfterCopy = (CopyVT != VA.getLocVT()); |
2558 | } |
2559 | |
2560 | SDValue Val; |
2561 | if (VA.needsCustom()) { |
2562 | assert(VA.getValVT() == MVT::v64i1 &&
2563 | "Currently the only custom case is when we split v64i1 to 2 regs");
2564 | Val = |
2565 | getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); |
2566 | } else { |
2567 | Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) |
2568 | .getValue(1); |
2569 | Val = Chain.getValue(0); |
2570 | InFlag = Chain.getValue(2); |
2571 | } |
2572 | |
2573 | if (RoundAfterCopy) |
2574 | Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, |
2575 | // This truncation won't change the value. |
2576 | DAG.getIntPtrConstant(1, dl)); |
2577 | |
2578 | if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { |
2579 | if (VA.getValVT().isVector() && |
2580 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || |
2581 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { |
2582 | // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
2583 | Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); |
2584 | } else |
2585 | Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); |
2586 | } |
2587 | |
2588 | InVals.push_back(Val); |
2589 | } |
2590 | |
2591 | return Chain; |
2592 | } |
2593 | |
2594 | //===----------------------------------------------------------------------===// |
2595 | // C & StdCall & Fast Calling Convention implementation |
2596 | //===----------------------------------------------------------------------===// |
2597 | // The StdCall calling convention is standard for many Windows API
2598 | // routines. It differs from the C calling convention just a little:
2599 | // the callee should clean up the stack, not the caller. Symbols should also
2600 | // be decorated in some fancy way :) It doesn't support any vector arguments.
2601 | // For info on fast calling convention see Fast Calling Convention (tail call) |
2602 | // implementation LowerX86_32FastCCCallTo. |
2603 | |
2604 | /// Determines whether a call uses struct return semantics.
2606 | enum StructReturnType { |
2607 | NotStructReturn, |
2608 | RegStructReturn, |
2609 | StackStructReturn |
2610 | }; |
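| // For example: an sret argument passed on the stack classifies as
| // StackStructReturn; an 'inreg sret' argument, or any sret when targeting
| // MCU, classifies as RegStructReturn.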
2611 | static StructReturnType |
2612 | callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { |
2613 | if (Outs.empty()) |
2614 | return NotStructReturn; |
2615 | |
2616 | const ISD::ArgFlagsTy &Flags = Outs[0].Flags; |
2617 | if (!Flags.isSRet()) |
2618 | return NotStructReturn; |
2619 | if (Flags.isInReg() || IsMCU) |
2620 | return RegStructReturn; |
2621 | return StackStructReturn; |
2622 | } |
2623 | |
2624 | /// Determines whether a function uses struct return semantics. |
2625 | static StructReturnType |
2626 | argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { |
2627 | if (Ins.empty()) |
2628 | return NotStructReturn; |
2629 | |
2630 | const ISD::ArgFlagsTy &Flags = Ins[0].Flags; |
2631 | if (!Flags.isSRet()) |
2632 | return NotStructReturn; |
2633 | if (Flags.isInReg() || IsMCU) |
2634 | return RegStructReturn; |
2635 | return StackStructReturn; |
2636 | } |
2637 | |
2638 | /// Make a copy of an aggregate at the address "Src" to the address "Dst"
2639 | /// with size and alignment information specified by the byval parameter
2640 | /// attribute. The copy will be passed as a byval function parameter.
2641 | static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, |
2642 | SDValue Chain, ISD::ArgFlagsTy Flags, |
2643 | SelectionDAG &DAG, const SDLoc &dl) { |
2644 | SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); |
2645 | |
2646 | return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), |
2647 | /*isVolatile*/false, /*AlwaysInline=*/true, |
2648 | /*isTailCall*/false, |
2649 | MachinePointerInfo(), MachinePointerInfo()); |
2650 | } |
2651 | |
2652 | /// Return true if the calling convention is one that we can guarantee TCO for. |
2653 | static bool canGuaranteeTCO(CallingConv::ID CC) { |
2654 | return (CC == CallingConv::Fast || CC == CallingConv::GHC || |
2655 | CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || |
2656 | CC == CallingConv::HHVM); |
2657 | } |
2658 | |
2659 | /// Return true if we might ever do TCO for calls with this calling convention. |
2660 | static bool mayTailCallThisCC(CallingConv::ID CC) { |
2661 | switch (CC) { |
2662 | // C calling conventions: |
2663 | case CallingConv::C: |
2664 | case CallingConv::X86_64_Win64: |
2665 | case CallingConv::X86_64_SysV: |
2666 | // Callee pop conventions: |
2667 | case CallingConv::X86_ThisCall: |
2668 | case CallingConv::X86_StdCall: |
2669 | case CallingConv::X86_VectorCall: |
2670 | case CallingConv::X86_FastCall: |
2671 | return true; |
2672 | default: |
2673 | return canGuaranteeTCO(CC); |
2674 | } |
2675 | } |
2676 | |
2677 | /// Return true if the function is being made into a tailcall target by |
2678 | /// changing its ABI. |
2679 | static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { |
2680 | return GuaranteedTailCallOpt && canGuaranteeTCO(CC); |
2681 | } |
2682 | |
2683 | bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
2684 | auto Attr = |
2685 | CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); |
2686 | if (!CI->isTailCall() || Attr.getValueAsString() == "true") |
2687 | return false; |
2688 | |
2689 | ImmutableCallSite CS(CI); |
2690 | CallingConv::ID CalleeCC = CS.getCallingConv(); |
2691 | if (!mayTailCallThisCC(CalleeCC)) |
2692 | return false; |
2693 | |
2694 | return true; |
2695 | } |
2696 | |
2697 | SDValue |
2698 | X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, |
2699 | const SmallVectorImpl<ISD::InputArg> &Ins, |
2700 | const SDLoc &dl, SelectionDAG &DAG, |
2701 | const CCValAssign &VA, |
2702 | MachineFrameInfo &MFI, unsigned i) const { |
2703 | // Create the nodes corresponding to a load from this parameter slot. |
2704 | ISD::ArgFlagsTy Flags = Ins[i].Flags; |
2705 | bool AlwaysUseMutable = shouldGuaranteeTCO( |
2706 | CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); |
2707 | bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); |
2708 | EVT ValVT; |
2709 | MVT PtrVT = getPointerTy(DAG.getDataLayout()); |
2710 | |
2711 | // If the value is passed by pointer, we have the address passed instead of
2712 | // the value itself. There is no need to extend if the mask value and the
2713 | // location share the same absolute size.
2714 | bool ExtendedInMem = |
2715 | VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && |
2716 | VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); |
2717 | |
2718 | if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) |
2719 | ValVT = VA.getLocVT(); |
2720 | else |
2721 | ValVT = VA.getValVT(); |
2722 | |
2723 | // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2724 | // normally taken by a return address.
2725 | int Offset = 0; |
2726 | if (CallConv == CallingConv::X86_INTR) { |
2727 | // X86 interrupts may take one or two arguments.
2728 | // On the stack there will be no return address as in a regular call.
2729 | // The offset of the last argument needs to be set to -4/-8 bytes, while
2730 | // the offset of the first argument (out of two) should be set to 0 bytes.
2731 | Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); |
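| // E.g. on x86-64 with a single argument: Ins.size() == 1, so
| // Offset = 8 * ((0 + 1) % 1 - 1) = -8, the slot just below where a return
| // address would normally sit.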
2732 | if (Subtarget.is64Bit() && Ins.size() == 2) { |
2733 | // The stack pointer needs to be realigned for 64 bit handlers with error |
2734 | // code, so the argument offset changes by 8 bytes. |
2735 | Offset += 8; |
2736 | } |
2737 | } |
2738 | |
2739 | // FIXME: For now, all byval parameter objects are marked mutable. This can
2740 | // be changed with more analysis.
2741 | // In case of tail call optimization, mark all arguments mutable, since they
2742 | // could be overwritten by the lowering of arguments in case of a tail call.
2743 | if (Flags.isByVal()) { |
2744 | unsigned Bytes = Flags.getByValSize(); |
2745 | if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. |
2746 | int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); |
2747 | // Adjust SP offset of interrupt parameter. |
2748 | if (CallConv == CallingConv::X86_INTR) { |
2749 | MFI.setObjectOffset(FI, Offset); |
2750 | } |
2751 | return DAG.getFrameIndex(FI, PtrVT); |
2752 | } |
2753 | |
2754 | // This is an argument in memory. We might be able to perform copy elision. |
2755 | if (Flags.isCopyElisionCandidate()) { |
2756 | EVT ArgVT = Ins[i].ArgVT; |
2757 | SDValue PartAddr; |
2758 | if (Ins[i].PartOffset == 0) { |
2759 | // If this is a one-part value or the first part of a multi-part value, |
2760 | // create a stack object for the entire argument value type and return a |
2761 | // load from our portion of it. This assumes that if the first part of an |
2762 | // argument is in memory, the rest will also be in memory. |
2763 | int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), |
2764 | /*Immutable=*/false); |
2765 | PartAddr = DAG.getFrameIndex(FI, PtrVT); |
2766 | return DAG.getLoad( |
2767 | ValVT, dl, Chain, PartAddr, |
2768 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); |
2769 | } else { |
2770 | // This is not the first piece of an argument in memory. See if there is |
2771 | // already a fixed stack object including this offset. If so, assume it |
2772 | // was created by the PartOffset == 0 branch above and create a load from |
2773 | // the appropriate offset into it. |
2774 | int64_t PartBegin = VA.getLocMemOffset(); |
2775 | int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; |
2776 | int FI = MFI.getObjectIndexBegin(); |
2777 | for (; MFI.isFixedObjectIndex(FI); ++FI) { |
2778 | int64_t ObjBegin = MFI.getObjectOffset(FI); |
2779 | int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); |
2780 | if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) |
2781 | break; |
2782 | } |
2783 | if (MFI.isFixedObjectIndex(FI)) { |
2784 | SDValue Addr = |
2785 | DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), |
2786 | DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); |
2787 | return DAG.getLoad( |
2788 | ValVT, dl, Chain, Addr, |
2789 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, |
2790 | Ins[i].PartOffset)); |
2791 | } |
2792 | } |
2793 | } |
2794 | |
2795 | int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, |
2796 | VA.getLocMemOffset(), isImmutable); |
2797 | |
2798 | // Set SExt or ZExt flag. |
2799 | if (VA.getLocInfo() == CCValAssign::ZExt) { |
2800 | MFI.setObjectZExt(FI, true); |
2801 | } else if (VA.getLocInfo() == CCValAssign::SExt) { |
2802 | MFI.setObjectSExt(FI, true); |
2803 | } |
2804 | |
2805 | // Adjust SP offset of interrupt parameter. |
2806 | if (CallConv == CallingConv::X86_INTR) { |
2807 | MFI.setObjectOffset(FI, Offset); |
2808 | } |
2809 | |
2810 | SDValue FIN = DAG.getFrameIndex(FI, PtrVT); |
2811 | SDValue Val = DAG.getLoad( |
2812 | ValVT, dl, Chain, FIN, |
2813 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); |
2814 | return ExtendedInMem |
2815 | ? (VA.getValVT().isVector() |
2816 | ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) |
2817 | : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) |
2818 | : Val; |
2819 | } |
2820 | |
2821 | // FIXME: Get this from tablegen. |
2822 | static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, |
2823 | const X86Subtarget &Subtarget) { |
2824 | assert(Subtarget.is64Bit());
2825 | |
2826 | if (Subtarget.isCallingConvWin64(CallConv)) { |
2827 | static const MCPhysReg GPR64ArgRegsWin64[] = { |
2828 | X86::RCX, X86::RDX, X86::R8, X86::R9 |
2829 | }; |
2830 | return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); |
2831 | } |
2832 | |
2833 | static const MCPhysReg GPR64ArgRegs64Bit[] = { |
2834 | X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 |
2835 | }; |
2836 | return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); |
2837 | } |
2838 | |
2839 | // FIXME: Get this from tablegen. |
2840 | static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, |
2841 | CallingConv::ID CallConv, |
2842 | const X86Subtarget &Subtarget) { |
2843 | assert(Subtarget.is64Bit());
2844 | if (Subtarget.isCallingConvWin64(CallConv)) { |
2845 | // The XMM registers which might contain var arg parameters are shadowed
2846 | // by their paired GPRs. So we only need to save the GPRs to their home
2847 | // slots.
2848 | // TODO: __vectorcall will change this. |
2849 | return None; |
2850 | } |
2851 | |
2852 | const Function *Fn = MF.getFunction(); |
2853 | bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); |
2854 | bool isSoftFloat = Subtarget.useSoftFloat(); |
2855 | assert(!(isSoftFloat && NoImplicitFloatOps) &&
2856 | "SSE register cannot be used when SSE is disabled!");
2857 | if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) |
2858 | // Kernel mode asks for SSE to be disabled, so there are no XMM argument |
2859 | // registers. |
2860 | return None; |
2861 | |
2862 | static const MCPhysReg XMMArgRegs64Bit[] = { |
2863 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, |
2864 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 |
2865 | }; |
2866 | return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); |
2867 | } |
2868 | |
2869 | #ifndef NDEBUG |
2870 | static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) { |
2871 | return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), |
2872 | [](const CCValAssign &A, const CCValAssign &B) -> bool { |
2873 | return A.getValNo() < B.getValNo(); |
2874 | }); |
2875 | } |
2876 | #endif |
2877 | |
2878 | SDValue X86TargetLowering::LowerFormalArguments( |
2879 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
2880 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
2881 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
2882 | MachineFunction &MF = DAG.getMachineFunction(); |
2883 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
2884 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); |
2885 | |
2886 | const Function *Fn = MF.getFunction(); |
2887 | if (Fn->hasExternalLinkage() && |
2888 | Subtarget.isTargetCygMing() && |
2889 | Fn->getName() == "main") |
2890 | FuncInfo->setForceFramePointer(true); |
2891 | |
2892 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
2893 | bool Is64Bit = Subtarget.is64Bit(); |
2894 | bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); |
2895 | |
2896 | assert(
2897 | !(isVarArg && canGuaranteeTCO(CallConv)) &&
2898 | "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2899 | |
2900 | if (CallConv == CallingConv::X86_INTR) { |
2901 | bool isLegal = Ins.size() == 1 || |
2902 | (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || |
2903 | (!Is64Bit && Ins[1].VT == MVT::i32))); |
2904 | if (!isLegal) |
2905 | report_fatal_error("X86 interrupts may take one or two arguments"); |
2906 | } |
2907 | |
2908 | // Assign locations to all of the incoming arguments. |
2909 | SmallVector<CCValAssign, 16> ArgLocs; |
2910 | CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
2911 | |
2912 | // Allocate shadow area for Win64. |
2913 | if (IsWin64) |
2914 | CCInfo.AllocateStack(32, 8); |
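| // (These 32 bytes are the four 8-byte "home" slots the Win64 ABI requires
| // the caller to reserve for the register parameters RCX/RDX/R8/R9.)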
2915 | |
2916 | CCInfo.AnalyzeArguments(Ins, CC_X86); |
2917 | |
2918 | // In the vectorcall calling convention, a second pass is required for the
2919 | // HVA types.
2920 | if (CallingConv::X86_VectorCall == CallConv) { |
2921 | CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); |
2922 | } |
2923 | |
2924 | // The next loop assumes that the locations are in the same order as the
2925 | // input arguments.
2926 | assert(isSortedByValueNo(ArgLocs) &&
2927 | "Argument Location list must be sorted before lowering");
2928 | |
2929 | SDValue ArgValue; |
2930 | for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; |
2931 | ++I, ++InsIndex) { |
2932 | assert(InsIndex < Ins.size() && "Invalid Ins index");
2933 | CCValAssign &VA = ArgLocs[I]; |
2934 | |
2935 | if (VA.isRegLoc()) { |
2936 | EVT RegVT = VA.getLocVT(); |
2937 | if (VA.needsCustom()) { |
2938 | assert(
2939 | VA.getValVT() == MVT::v64i1 &&
2940 | "Currently the only custom case is when we split v64i1 to 2 regs");
2941 | |
2942 | // In the regcall calling convention, v64i1 values that are compiled for
2943 | // a 32 bit arch are split up into two registers.
2944 | ArgValue = |
2945 | getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); |
2946 | } else { |
2947 | const TargetRegisterClass *RC; |
2948 | if (RegVT == MVT::i32) |
2949 | RC = &X86::GR32RegClass; |
2950 | else if (Is64Bit && RegVT == MVT::i64) |
2951 | RC = &X86::GR64RegClass; |
2952 | else if (RegVT == MVT::f32) |
2953 | RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; |
2954 | else if (RegVT == MVT::f64) |
2955 | RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; |
2956 | else if (RegVT == MVT::f80) |
2957 | RC = &X86::RFP80RegClass; |
2958 | else if (RegVT == MVT::f128) |
2959 | RC = &X86::FR128RegClass; |
2960 | else if (RegVT.is512BitVector()) |
2961 | RC = &X86::VR512RegClass; |
2962 | else if (RegVT.is256BitVector()) |
2963 | RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; |
2964 | else if (RegVT.is128BitVector()) |
2965 | RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; |
2966 | else if (RegVT == MVT::x86mmx) |
2967 | RC = &X86::VR64RegClass; |
2968 | else if (RegVT == MVT::v1i1) |
2969 | RC = &X86::VK1RegClass; |
2970 | else if (RegVT == MVT::v8i1) |
2971 | RC = &X86::VK8RegClass; |
2972 | else if (RegVT == MVT::v16i1) |
2973 | RC = &X86::VK16RegClass; |
2974 | else if (RegVT == MVT::v32i1) |
2975 | RC = &X86::VK32RegClass; |
2976 | else if (RegVT == MVT::v64i1) |
2977 | RC = &X86::VK64RegClass; |
2978 | else |
2979 | llvm_unreachable("Unknown argument type!");
2980 | |
2981 | unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); |
2982 | ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); |
2983 | } |
2984 | |
2985 | // If this is an 8 or 16-bit value, it is really passed promoted to 32 |
2986 | // bits. Insert an assert[sz]ext to capture this, then truncate to the |
2987 | // right size. |
2988 | if (VA.getLocInfo() == CCValAssign::SExt) |
2989 | ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, |
2990 | DAG.getValueType(VA.getValVT())); |
2991 | else if (VA.getLocInfo() == CCValAssign::ZExt) |
2992 | ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, |
2993 | DAG.getValueType(VA.getValVT())); |
2994 | else if (VA.getLocInfo() == CCValAssign::BCvt) |
2995 | ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); |
2996 | |
2997 | if (VA.isExtInLoc()) { |
2998 | // Handle MMX values passed in XMM regs. |
2999 | if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) |
3000 | ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); |
3001 | else if (VA.getValVT().isVector() && |
3002 | VA.getValVT().getScalarType() == MVT::i1 && |
3003 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || |
3004 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { |
3005 | // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 |
3006 | ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); |
3007 | } else |
3008 | ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); |
3009 | } |
3010 | } else { |
3011 | assert(VA.isMemLoc());
3012 | ArgValue = |
3013 | LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); |
3014 | } |
3015 | |
3016 | // If the value is passed via a pointer, do a load.
3017 | if (VA.getLocInfo() == CCValAssign::Indirect) |
3018 | ArgValue = |
3019 | DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); |
3020 | |
3021 | InVals.push_back(ArgValue); |
3022 | } |
3023 | |
3024 | for (unsigned I = 0, E = Ins.size(); I != E; ++I) { |
3025 | // The Swift calling convention does not require that we copy the sret
3026 | // argument into %rax/%eax for the return; we don't set SRetReturnReg for Swift.
3027 | if (CallConv == CallingConv::Swift) |
3028 | continue; |
3029 | |
3030 | // All x86 ABIs require that for returning structs by value we copy the |
3031 | // sret argument into %rax/%eax (depending on ABI) for the return. Save |
3032 | // the argument into a virtual register so that we can access it from the |
3033 | // return points. |
3034 | if (Ins[I].Flags.isSRet()) { |
3035 | unsigned Reg = FuncInfo->getSRetReturnReg(); |
3036 | if (!Reg) { |
3037 | MVT PtrTy = getPointerTy(DAG.getDataLayout()); |
3038 | Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); |
3039 | FuncInfo->setSRetReturnReg(Reg); |
3040 | } |
3041 | SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); |
3042 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); |
3043 | break; |
3044 | } |
3045 | } |
3046 | |
3047 | unsigned StackSize = CCInfo.getNextStackOffset(); |
3048 | // Align stack specially for tail calls. |
3049 | if (shouldGuaranteeTCO(CallConv, |
3050 | MF.getTarget().Options.GuaranteedTailCallOpt)) |
3051 | StackSize = GetAlignedArgumentStackSize(StackSize, DAG); |
3052 | |
3053 | // If the function takes a variable number of arguments, make a frame index for
3054 | // the start of the first vararg value... for expansion of llvm.va_start. We |
3055 | // can skip this if there are no va_start calls. |
3056 | if (MFI.hasVAStart() && |
3057 | (Is64Bit || (CallConv != CallingConv::X86_FastCall && |
3058 | CallConv != CallingConv::X86_ThisCall))) { |
3059 | FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); |
3060 | } |
3061 | |
3062 | // Figure out if XMM registers are in use. |
3063 | assert(!(Subtarget.useSoftFloat() &&
3064 | Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3065 | "SSE register cannot be used when SSE is disabled!");
3066 | |
3067 | // 64-bit calling conventions support varargs and register parameters, so we |
3068 | // have to do extra work to spill them in the prologue. |
3069 | if (Is64Bit && isVarArg && MFI.hasVAStart()) { |
3070 | // Find the first unallocated argument registers. |
3071 | ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); |
3072 | ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); |
3073 | unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); |
3074 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); |
3075 | assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3076 | "SSE register cannot be used when SSE is disabled!");
3077 | |
3078 | // Gather all the live in physical registers. |
3079 | SmallVector<SDValue, 6> LiveGPRs; |
3080 | SmallVector<SDValue, 8> LiveXMMRegs; |
3081 | SDValue ALVal; |
3082 | for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { |
3083 | unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); |
3084 | LiveGPRs.push_back( |
3085 | DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); |
3086 | } |
3087 | if (!ArgXMMs.empty()) { |
3088 | unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); |
3089 | ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); |
3090 | for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { |
3091 | unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); |
3092 | LiveXMMRegs.push_back( |
3093 | DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); |
3094 | } |
3095 | } |
3096 | |
3097 | if (IsWin64) { |
3098 | // Get to the caller-allocated home save location. Add 8 to account |
3099 | // for the return address. |
3100 | int HomeOffset = TFI.getOffsetOfLocalArea() + 8; |
3101 | FuncInfo->setRegSaveFrameIndex( |
3102 | MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); |
3103 | // Fixup to set vararg frame on shadow area (4 x i64). |
3104 | if (NumIntRegs < 4) |
3105 | FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); |
3106 | } else { |
3107 | // For X86-64, if there are vararg parameters that are passed via |
3108 | // registers, then we must store them to their spots on the stack so |
3109 | // they may be loaded by dereferencing the result of va_next. |
3110 | FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); |
3111 | FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); |
3112 | FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( |
3113 | ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); |
3114 | } |
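| // Schematically, the SysV x86-64 register save area built here is:
| //   bytes   0..47  : up to 6 GPRs, 8 bytes each (indexed by VarArgsGPOffset)
| //   bytes  48..175 : up to 8 XMM regs, 16 bytes each (VarArgsFPOffset)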
3115 | |
3116 | // Store the integer parameter registers. |
3117 | SmallVector<SDValue, 8> MemOps; |
3118 | SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), |
3119 | getPointerTy(DAG.getDataLayout())); |
3120 | unsigned Offset = FuncInfo->getVarArgsGPOffset(); |
3121 | for (SDValue Val : LiveGPRs) { |
3122 | SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), |
3123 | RSFIN, DAG.getIntPtrConstant(Offset, dl)); |
3124 | SDValue Store = |
3125 | DAG.getStore(Val.getValue(1), dl, Val, FIN, |
3126 | MachinePointerInfo::getFixedStack( |
3127 | DAG.getMachineFunction(), |
3128 | FuncInfo->getRegSaveFrameIndex(), Offset)); |
3129 | MemOps.push_back(Store); |
3130 | Offset += 8; |
3131 | } |
3132 | |
3133 | if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { |
3134 | // Now store the XMM (fp + vector) parameter registers. |
3135 | SmallVector<SDValue, 12> SaveXMMOps; |
3136 | SaveXMMOps.push_back(Chain); |
3137 | SaveXMMOps.push_back(ALVal); |
3138 | SaveXMMOps.push_back(DAG.getIntPtrConstant( |
3139 | FuncInfo->getRegSaveFrameIndex(), dl)); |
3140 | SaveXMMOps.push_back(DAG.getIntPtrConstant( |
3141 | FuncInfo->getVarArgsFPOffset(), dl)); |
3142 | SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), |
3143 | LiveXMMRegs.end()); |
3144 | MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, |
3145 | MVT::Other, SaveXMMOps)); |
3146 | } |
3147 | |
3148 | if (!MemOps.empty()) |
3149 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); |
3150 | } |
3151 | |
3152 | if (isVarArg && MFI.hasMustTailInVarArgFunc()) { |
3153 | // Find the largest legal vector type. |
3154 | MVT VecVT = MVT::Other; |
3155 | // FIXME: Only some x86_32 calling conventions support AVX512. |
3156 | if (Subtarget.hasAVX512() && |
3157 | (Is64Bit || (CallConv == CallingConv::X86_VectorCall || |
3158 | CallConv == CallingConv::Intel_OCL_BI))) |
3159 | VecVT = MVT::v16f32; |
3160 | else if (Subtarget.hasAVX()) |
3161 | VecVT = MVT::v8f32; |
3162 | else if (Subtarget.hasSSE2()) |
3163 | VecVT = MVT::v4f32; |
3164 | |
3165 | // We forward some GPRs and some vector types. |
3166 | SmallVector<MVT, 2> RegParmTypes; |
3167 | MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; |
3168 | RegParmTypes.push_back(IntVT); |
3169 | if (VecVT != MVT::Other) |
3170 | RegParmTypes.push_back(VecVT); |
3171 | |
3172 | // Compute the set of forwarded registers. The rest are scratch. |
3173 | SmallVectorImpl<ForwardedRegister> &Forwards = |
3174 | FuncInfo->getForwardedMustTailRegParms(); |
3175 | CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); |
3176 | |
3177 | // Conservatively forward AL on x86_64, since it might be used for varargs. |
3178 | if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { |
3179 | unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); |
3180 | Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); |
3181 | } |
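| // (In the SysV x86-64 varargs convention, AL carries the number of XMM
| // registers actually used, so a musttail thunk must pass it through.)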
3182 | |
3183 | // Copy all forwards from physical to virtual registers. |
3184 | for (ForwardedRegister &F : Forwards) { |
3185 | // FIXME: Can we use a less constrained schedule? |
3186 | SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); |
3187 | F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); |
3188 | Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); |
3189 | } |
3190 | } |
3191 | |
3192 | // Some CCs need callee pop. |
3193 | if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, |
3194 | MF.getTarget().Options.GuaranteedTailCallOpt)) { |
3195 | FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. |
3196 | } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { |
3197 | // X86 interrupts must pop the error code (and the alignment padding) if |
3198 | // present. |
3199 | FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); |
3200 | } else { |
3201 | FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. |
3202 | // If this is an sret function, the return should pop the hidden pointer. |
3203 | if (!Is64Bit && !canGuaranteeTCO(CallConv) && |
3204 | !Subtarget.getTargetTriple().isOSMSVCRT() && |
3205 | argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) |
3206 | FuncInfo->setBytesToPopOnReturn(4); |
3207 | } |
3208 | |
3209 | if (!Is64Bit) { |
3210 | // RegSaveFrameIndex is X86-64 only. |
3211 | FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); |
3212 | if (CallConv == CallingConv::X86_FastCall || |
3213 | CallConv == CallingConv::X86_ThisCall) |
3214 | // fastcall/thiscall functions can't have varargs.
3215 | FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); |
3216 | } |
3217 | |
3218 | FuncInfo->setArgumentStackSize(StackSize); |
3219 | |
3220 | if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { |
3221 | EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); |
3222 | if (Personality == EHPersonality::CoreCLR) { |
3223 | assert(Is64Bit);
3224 | // TODO: Add a mechanism to frame lowering that will allow us to indicate |
3225 | // that we'd prefer this slot be allocated towards the bottom of the frame |
3226 | // (i.e. near the stack pointer after allocating the frame). Every |
3227 | // funclet needs a copy of this slot in its (mostly empty) frame, and the |
3228 | // offset from the bottom of this and each funclet's frame must be the |
3229 | // same, so the size of funclets' (mostly empty) frames is dictated by |
3230 | // how far this slot is from the bottom (since they allocate just enough |
3231 | // space to accommodate holding this slot at the correct offset). |
3232 | int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); |
3233 | EHInfo->PSPSymFrameIdx = PSPSymFI; |
3234 | } |
3235 | } |
3236 | |
3237 | if (CallConv == CallingConv::X86_RegCall || |
3238 | Fn->hasFnAttribute("no_caller_saved_registers")) { |
3239 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3240 | for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end())) |
3241 | MF.getRegInfo().disableCalleeSavedRegister(Pair.first); |
3242 | } |
3243 | |
3244 | return Chain; |
3245 | } |
3246 | |
3247 | SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, |
3248 | SDValue Arg, const SDLoc &dl, |
3249 | SelectionDAG &DAG, |
3250 | const CCValAssign &VA, |
3251 | ISD::ArgFlagsTy Flags) const { |
3252 | unsigned LocMemOffset = VA.getLocMemOffset(); |
3253 | SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); |
3254 | PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), |
3255 | StackPtr, PtrOff); |
3256 | if (Flags.isByVal()) |
3257 | return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); |
3258 | |
3259 | return DAG.getStore( |
3260 | Chain, dl, Arg, PtrOff, |
3261 | MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); |
3262 | } |
3263 | |
3264 | /// Emit a load of the return address if tail call
3265 | /// optimization is performed and it is required.
3266 | SDValue X86TargetLowering::EmitTailCallLoadRetAddr( |
3267 | SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, |
3268 | bool Is64Bit, int FPDiff, const SDLoc &dl) const { |
3269 | // Adjust the Return address stack slot. |
3270 | EVT VT = getPointerTy(DAG.getDataLayout()); |
3271 | OutRetAddr = getReturnAddressFrameIndex(DAG); |
3272 | |
3273 | // Load the "old" Return address. |
3274 | OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); |
3275 | return SDValue(OutRetAddr.getNode(), 1); |
3276 | } |
3277 | |
3278 | /// Emit a store of the return address if tail call |
3279 | /// optimization is performed and it is required (FPDiff!=0). |
3280 | static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, |
3281 | SDValue Chain, SDValue RetAddrFrIdx, |
3282 | EVT PtrVT, unsigned SlotSize, |
3283 | int FPDiff, const SDLoc &dl) { |
3284 | // Store the return address to the appropriate stack slot. |
3285 | if (!FPDiff) return Chain; |
3286 | // Calculate the new stack slot for the return address. |
3287 | int NewReturnAddrFI = |
3288 | MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, |
3289 | false); |
3290 | SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); |
3291 | Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, |
3292 | MachinePointerInfo::getFixedStack( |
3293 | DAG.getMachineFunction(), NewReturnAddrFI)); |
3294 | return Chain; |
3295 | } |
3296 | |
3297 | /// Returns a vector_shuffle mask for a movs{s|d} or movd
3298 | /// operation of the specified width.
3299 | static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, |
3300 | SDValue V2) { |
3301 | unsigned NumElems = VT.getVectorNumElements(); |
3302 | SmallVector<int, 8> Mask; |
3303 | Mask.push_back(NumElems); |
3304 | for (unsigned i = 1; i != NumElems; ++i) |
3305 | Mask.push_back(i); |
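| // E.g. for a 4-element type the mask is <4, 1, 2, 3>: element 0 comes from
| // V2 (indices >= NumElems select from V2) and elements 1-3 from V1.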
3306 | return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
3307 | } |
3308 | |
3309 | SDValue |
3310 | X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
3311 | SmallVectorImpl<SDValue> &InVals) const { |
3312 | SelectionDAG &DAG = CLI.DAG; |
3313 | SDLoc &dl = CLI.DL; |
3314 | SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
3315 | SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
3316 | SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; |
3317 | SDValue Chain = CLI.Chain; |
3318 | SDValue Callee = CLI.Callee; |
3319 | CallingConv::ID CallConv = CLI.CallConv; |
3320 | bool &isTailCall = CLI.IsTailCall; |
3321 | bool isVarArg = CLI.IsVarArg; |
3322 | |
3323 | MachineFunction &MF = DAG.getMachineFunction(); |
3324 | bool Is64Bit = Subtarget.is64Bit(); |
3325 | bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); |
3326 | StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); |
3327 | bool IsSibcall = false; |
3328 | X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); |
3329 | auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); |
3330 | const CallInst *CI = |
3331 | CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr; |
3332 | const Function *Fn = CI ? CI->getCalledFunction() : nullptr; |
3333 | bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || |
3334 | (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); |
3335 | |
3336 | if (CallConv == CallingConv::X86_INTR) |
3337 | report_fatal_error("X86 interrupts may not be called directly"); |
3338 | |
3339 | if (Attr.getValueAsString() == "true") |
3340 | isTailCall = false; |
3341 | |
3342 | if (Subtarget.isPICStyleGOT() && |
3343 | !MF.getTarget().Options.GuaranteedTailCallOpt) { |
3344 | // If we are using a GOT, disable tail calls to external symbols with |
3345 | // default visibility. Tail calling such a symbol requires using a GOT |
3346 | // relocation, which forces early binding of the symbol. This breaks code |
3347 | // that requires lazy function symbol resolution. Using musttail or |
3348 | // GuaranteedTailCallOpt will override this. |
3349 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); |
3350 | if (!G || (!G->getGlobal()->hasLocalLinkage() && |
3351 | G->getGlobal()->hasDefaultVisibility())) |
3352 | isTailCall = false; |
3353 | } |
3354 | |
3355 | bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); |
3356 | if (IsMustTail) { |
3357 | // Force this to be a tail call. The verifier rules are enough to ensure |
3358 | // that we can lower this successfully without moving the return address |
3359 | // around. |
3360 | isTailCall = true; |
3361 | } else if (isTailCall) { |
3362 | // Check if it's really possible to do a tail call. |
3363 | isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, |
3364 | isVarArg, SR != NotStructReturn, |
3365 | MF.getFunction()->hasStructRetAttr(), CLI.RetTy, |
3366 | Outs, OutVals, Ins, DAG); |
3367 | |
3368 | // Sibcalls are automatically detected tailcalls which do not require |
3369 | // ABI changes. |
3370 | if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) |
3371 | IsSibcall = true; |
3372 | |
3373 | if (isTailCall) |
3374 | ++NumTailCalls; |
3375 | } |
3376 | |
3377 | assert(!(isVarArg && canGuaranteeTCO(CallConv)) && |
3378 | "Var args not supported with calling convention fastcc, ghc or hipe"); |
3379 | |
3380 | // Analyze operands of the call, assigning locations to each operand. |
3381 | SmallVector<CCValAssign, 16> ArgLocs; |
3382 | CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
3383 | |
3384 | // Allocate shadow area for Win64. |
3385 | if (IsWin64) |
3386 | CCInfo.AllocateStack(32, 8); |
3387 | |
3388 | CCInfo.AnalyzeArguments(Outs, CC_X86); |
3389 | |
3390 | // In the vectorcall calling convention, a second pass is required for the |
3391 | // HVA (homogeneous vector aggregate) types. |
3392 | if (CallingConv::X86_VectorCall == CallConv) { |
3393 | CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); |
3394 | } |
3395 | |
3396 | // Get a count of how many bytes are to be pushed on the stack. |
3397 | unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); |
3398 | if (IsSibcall) |
3399 | // This is a sibcall. The memory operands are already available in the |
3400 | // caller's own incoming argument stack area. |
3401 | NumBytes = 0; |
3402 | else if (MF.getTarget().Options.GuaranteedTailCallOpt && |
3403 | canGuaranteeTCO(CallConv)) |
3404 | NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); |
3405 | |
3406 | int FPDiff = 0; |
3407 | if (isTailCall && !IsSibcall && !IsMustTail) { |
3408 | // Lower arguments at fp - stackoffset + fpdiff. |
3409 | unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); |
3410 | |
3411 | FPDiff = NumBytesCallerPushed - NumBytes; |
3412 | |
3413 | // Record the delta by which the return address stack slot moves, keeping |
3414 | // the most negative (largest-movement) delta seen so far. |
3415 | if (FPDiff < X86Info->getTCReturnAddrDelta()) |
3416 | X86Info->setTCReturnAddrDelta(FPDiff); |
3417 | } |
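// Worked example (illustrative): if the caller pops 16 bytes on return but
// this call needs NumBytes = 32 of argument space, FPDiff = 16 - 32 = -16,
// meaning the return address slot must move 16 bytes towards the stack
// bottom; setTCReturnAddrDelta records the most negative delta seen.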
3418 | |
3419 | unsigned NumBytesToPush = NumBytes; |
3420 | unsigned NumBytesToPop = NumBytes; |
3421 | |
3422 | // If we have an inalloca argument, all stack space has already been allocated |
3423 | // for us and is right at the top of the stack. We don't support multiple |
3424 | // arguments passed in memory when using inalloca. |
3425 | if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { |
3426 | NumBytesToPush = 0; |
3427 | if (!ArgLocs.back().isMemLoc()) |
3428 | report_fatal_error("cannot use inalloca attribute on a register " |
3429 | "parameter"); |
3430 | if (ArgLocs.back().getLocMemOffset() != 0) |
3431 | report_fatal_error("any parameter with the inalloca attribute must be " |
3432 | "the only memory argument"); |
3433 | } |
3434 | |
3435 | if (!IsSibcall) |
3436 | Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, |
3437 | NumBytes - NumBytesToPush, dl); |
3438 | |
3439 | SDValue RetAddrFrIdx; |
3440 | // Load return address for tail calls. |
3441 | if (isTailCall && FPDiff) |
3442 | Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, |
3443 | Is64Bit, FPDiff, dl); |
3444 | |
3445 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
3446 | SmallVector<SDValue, 8> MemOpChains; |
3447 | SDValue StackPtr; |
3448 | |
3449 | // The next loop assumes that the locations are in the same order as the |
3450 | // input arguments. |
3451 | assert(isSortedByValueNo(ArgLocs) && |
3452 | "Argument Location list must be sorted before lowering"); |
3453 | |
3454 | // Walk the register/memloc assignments, inserting copies/loads. In the case |
3455 | // of tail call optimization, arguments are handled later. |
3456 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
3457 | for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; |
3458 | ++I, ++OutIndex) { |
3459 | assert(OutIndex < Outs.size() && "Invalid Out index"); |
3460 | // Skip inalloca arguments, they have already been written. |
3461 | ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; |
3462 | if (Flags.isInAlloca()) |
3463 | continue; |
3464 | |
3465 | CCValAssign &VA = ArgLocs[I]; |
3466 | EVT RegVT = VA.getLocVT(); |
3467 | SDValue Arg = OutVals[OutIndex]; |
3468 | bool isByVal = Flags.isByVal(); |
3469 | |
3470 | // Promote the value if needed. |
3471 | switch (VA.getLocInfo()) { |
3472 | default: llvm_unreachable("Unknown loc info!"); |
3473 | case CCValAssign::Full: break; |
3474 | case CCValAssign::SExt: |
3475 | Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); |
3476 | break; |
3477 | case CCValAssign::ZExt: |
3478 | Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); |
3479 | break; |
3480 | case CCValAssign::AExt: |
3481 | if (Arg.getValueType().isVector() && |
3482 | Arg.getValueType().getVectorElementType() == MVT::i1) |
3483 | Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); |
3484 | else if (RegVT.is128BitVector()) { |
3485 | // Special case: passing MMX values in XMM registers. |
3486 | Arg = DAG.getBitcast(MVT::i64, Arg); |
3487 | Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); |
3488 | Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); |
3489 | } else |
3490 | Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); |
3491 | break; |
3492 | case CCValAssign::BCvt: |
3493 | Arg = DAG.getBitcast(RegVT, Arg); |
3494 | break; |
3495 | case CCValAssign::Indirect: { |
3496 | // Store the argument. |
3497 | SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); |
3498 | int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); |
3499 | Chain = DAG.getStore( |
3500 | Chain, dl, Arg, SpillSlot, |
3501 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); |
3502 | Arg = SpillSlot; |
3503 | break; |
3504 | } |
3505 | } |
3506 | |
3507 | if (VA.needsCustom()) { |
3508 | assert(VA.getValVT() == MVT::v64i1 && |
3509 | "Currently the only custom case is when we split v64i1 to 2 regs"); |
3510 | // Split the v64i1 value into two registers. |
3511 | Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], |
3512 | Subtarget); |
3513 | } else if (VA.isRegLoc()) { |
3514 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); |
3515 | if (isVarArg && IsWin64) { |
3516 | // The Win64 ABI requires an argument in an XMM register to be copied to |
3517 | // the corresponding integer shadow register if the callee is a varargs function. |
3518 | unsigned ShadowReg = 0; |
3519 | switch (VA.getLocReg()) { |
3520 | case X86::XMM0: ShadowReg = X86::RCX; break; |
3521 | case X86::XMM1: ShadowReg = X86::RDX; break; |
3522 | case X86::XMM2: ShadowReg = X86::R8; break; |
3523 | case X86::XMM3: ShadowReg = X86::R9; break; |
3524 | } |
3525 | if (ShadowReg) |
3526 | RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); |
3527 | } |
3528 | } else if (!IsSibcall && (!isTailCall || isByVal)) { |
3529 | assert(VA.isMemLoc()); |
3530 | if (!StackPtr.getNode()) |
3531 | StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), |
3532 | getPointerTy(DAG.getDataLayout())); |
3533 | MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, |
3534 | dl, DAG, VA, Flags)); |
3535 | } |
3536 | } |
3537 | |
3538 | if (!MemOpChains.empty()) |
3539 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); |
3540 | |
3541 | if (Subtarget.isPICStyleGOT()) { |
3542 | // ELF / PIC requires the GOT pointer to be in the EBX register before |
3543 | // making function calls via the PLT. |
3544 | if (!isTailCall) { |
3545 | RegsToPass.push_back(std::make_pair( |
3546 | unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), |
3547 | getPointerTy(DAG.getDataLayout())))); |
3548 | } else { |
3549 | // If we are tail calling and generating PIC/GOT style code load the |
3550 | // address of the callee into ECX. The value in ecx is used as target of |
3551 | // the tail jump. This is done to circumvent the ebx/callee-saved problem |
3552 | // for tail calls on PIC/GOT architectures. Normally we would just put the |
3553 | // address of GOT into ebx and then call target@PLT. But for tail calls |
3554 | // ebx would be restored (since ebx is callee saved) before jumping to the |
3555 | // target@PLT. |
3556 | |
3557 | // Note: The actual moving to ECX is done further down. |
3558 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); |
3559 | if (G && !G->getGlobal()->hasLocalLinkage() && |
3560 | G->getGlobal()->hasDefaultVisibility()) |
3561 | Callee = LowerGlobalAddress(Callee, DAG); |
3562 | else if (isa<ExternalSymbolSDNode>(Callee)) |
3563 | Callee = LowerExternalSymbol(Callee, DAG); |
3564 | } |
3565 | } |
3566 | |
3567 | if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { |
3568 | // From AMD64 ABI document: |
3569 | // For calls that may call functions that use varargs or stdargs |
3570 | // (prototype-less calls or calls to functions containing ellipsis (...) in |
3571 | // the declaration), %al is used as a hidden argument to specify the number |
3572 | // of SSE registers used. The contents of %al do not need to match exactly |
3573 | // the number of registers, but must be an upper bound on the number of SSE |
3574 | // registers used, and is in the range 0 - 8 inclusive. |
3575 | |
3576 | // Count the number of XMM registers allocated. |
3577 | static const MCPhysReg XMMArgRegs[] = { |
3578 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, |
3579 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 |
3580 | }; |
3581 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); |
3582 | assert((Subtarget.hasSSE1() || !NumXMMRegs) && |
3583 | "SSE registers cannot be used when SSE is disabled"); |
3584 | |
3585 | RegsToPass.push_back(std::make_pair(unsigned(X86::AL), |
3586 | DAG.getConstant(NumXMMRegs, dl, |
3587 | MVT::i8))); |
3588 | } |
3589 | |
3590 | if (isVarArg && IsMustTail) { |
3591 | const auto &Forwards = X86Info->getForwardedMustTailRegParms(); |
3592 | for (const auto &F : Forwards) { |
3593 | SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); |
3594 | RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); |
3595 | } |
3596 | } |
3597 | |
3598 | // For tail calls lower the arguments to the 'real' stack slots. Sibcalls |
3599 | // don't need this because the eligibility check rejects calls that require |
3600 | // shuffling arguments passed in memory. |
3601 | if (!IsSibcall && isTailCall) { |
3602 | // Force all the incoming stack arguments to be loaded from the stack |
3603 | // before any new outgoing arguments are stored to the stack, because the |
3604 | // outgoing stack slots may alias the incoming argument stack slots, and |
3605 | // the alias isn't otherwise explicit. This is slightly more conservative |
3606 | // than necessary, because it means that each store effectively depends |
3607 | // on every argument instead of just those arguments it would clobber. |
3608 | SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); |
3609 | |
3610 | SmallVector<SDValue, 8> MemOpChains2; |
3611 | SDValue FIN; |
3612 | int FI = 0; |
3613 | for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; |
3614 | ++I, ++OutsIndex) { |
3615 | CCValAssign &VA = ArgLocs[I]; |
3616 | |
3617 | if (VA.isRegLoc()) { |
3618 | if (VA.needsCustom()) { |
3619 | assert((CallConv == CallingConv::X86_RegCall) && |
3620 | "Expecting custom case only in regcall calling convention"); |
3621 | // This means that we are in a special case where one argument was |
3622 | // passed through two register locations - skip the next location. |
3623 | ++I; |
3624 | } |
3625 | |
3626 | continue; |
3627 | } |
3628 | |
3629 | assert(VA.isMemLoc()); |
3630 | SDValue Arg = OutVals[OutsIndex]; |
3631 | ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; |
3632 | // Skip inalloca arguments. They don't require any work. |
3633 | if (Flags.isInAlloca()) |
3634 | continue; |
3635 | // Create frame index. |
3636 | int32_t Offset = VA.getLocMemOffset()+FPDiff; |
3637 | uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; |
3638 | FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); |
3639 | FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); |
3640 | |
3641 | if (Flags.isByVal()) { |
3642 | // Copy relative to framepointer. |
3643 | SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); |
3644 | if (!StackPtr.getNode()) |
3645 | StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), |
3646 | getPointerTy(DAG.getDataLayout())); |
3647 | Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), |
3648 | StackPtr, Source); |
3649 | |
3650 | MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, |
3651 | ArgChain, |
3652 | Flags, DAG, dl)); |
3653 | } else { |
3654 | // Store relative to framepointer. |
3655 | MemOpChains2.push_back(DAG.getStore( |
3656 | ArgChain, dl, Arg, FIN, |
3657 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); |
3658 | } |
3659 | } |
3660 | |
3661 | if (!MemOpChains2.empty()) |
3662 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); |
3663 | |
3664 | // Store the return address to the appropriate stack slot. |
3665 | Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, |
3666 | getPointerTy(DAG.getDataLayout()), |
3667 | RegInfo->getSlotSize(), FPDiff, dl); |
3668 | } |
3669 | |
3670 | // Build a sequence of copy-to-reg nodes chained together with token chain |
3671 | // and flag operands which copy the outgoing args into registers. |
3672 | SDValue InFlag; |
3673 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { |
3674 | Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, |
3675 | RegsToPass[i].second, InFlag); |
3676 | InFlag = Chain.getValue(1); |
3677 | } |
3678 | |
3679 | if (DAG.getTarget().getCodeModel() == CodeModel::Large) { |
3680 | assert(Is64Bit && "Large code model is only legal in 64-bit mode."); |
3681 | // In the 64-bit large code model, we have to make all calls |
3682 | // through a register, since the call instruction's 32-bit |
3683 | // pc-relative offset may not be large enough to hold the whole |
3684 | // address. |
3685 | } else if (Callee->getOpcode() == ISD::GlobalAddress) { |
3686 | // If the callee is a GlobalAddress node (quite common, every direct call |
3687 | // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack |
3688 | // it. |
3689 | GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); |
3690 | |
3691 | // We should use an extra load for direct calls to dllimported functions |
3692 | // in non-JIT mode. |
3693 | const GlobalValue *GV = G->getGlobal(); |
3694 | if (!GV->hasDLLImportStorageClass()) { |
3695 | unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); |
3696 | |
3697 | Callee = DAG.getTargetGlobalAddress( |
3698 | GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); |
3699 | |
3700 | if (OpFlags == X86II::MO_GOTPCREL) { |
3701 | // Add a wrapper. |
3702 | Callee = DAG.getNode(X86ISD::WrapperRIP, dl, |
3703 | getPointerTy(DAG.getDataLayout()), Callee); |
3704 | // Add extra indirection |
3705 | Callee = DAG.getLoad( |
3706 | getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, |
3707 | MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
3708 | } |
3709 | } |
3710 | } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { |
3711 | const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); |
3712 | unsigned char OpFlags = |
3713 | Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); |
3714 | |
3715 | Callee = DAG.getTargetExternalSymbol( |
3716 | S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); |
3717 | } else if (Subtarget.isTarget64BitILP32() && |
3718 | Callee->getValueType(0) == MVT::i32) { |
3719 | // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI. |
3720 | Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); |
3721 | } |
3722 | |
3723 | // Returns a chain & a flag for retval copy to use. |
3724 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
3725 | SmallVector<SDValue, 8> Ops; |
3726 | |
3727 | if (!IsSibcall && isTailCall) { |
3728 | Chain = DAG.getCALLSEQ_END(Chain, |
3729 | DAG.getIntPtrConstant(NumBytesToPop, dl, true), |
3730 | DAG.getIntPtrConstant(0, dl, true), InFlag, dl); |
3731 | InFlag = Chain.getValue(1); |
3732 | } |
3733 | |
3734 | Ops.push_back(Chain); |
3735 | Ops.push_back(Callee); |
3736 | |
3737 | if (isTailCall) |
3738 | Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); |
3739 | |
3740 | // Add argument registers to the end of the list so that they are known live |
3741 | // into the call. |
3742 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) |
3743 | Ops.push_back(DAG.getRegister(RegsToPass[i].first, |
3744 | RegsToPass[i].second.getValueType())); |
3745 | |
3746 | // Add a register mask operand representing the call-preserved registers. |
3747 | // If HasNCSR is set (the no_caller_saved_registers attribute is present), |
3748 | // use the X86_INTR calling convention instead, because it has the same CSR |
3749 | // mask (same preserved registers). |
3750 | const uint32_t *Mask = RegInfo->getCallPreservedMask( |
3751 | MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); |
3752 | assert(Mask && "Missing call preserved mask for calling convention"); |
3753 | |
3754 | // If this is an invoke in a 32-bit function using a funclet-based |
3755 | // personality, assume the function clobbers all registers. If an exception |
3756 | // is thrown, the runtime will not restore CSRs. |
3757 | // FIXME: Model this more precisely so that we can register allocate across |
3758 | // the normal edge and spill and fill across the exceptional edge. |
3759 | if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { |
3760 | const Function *CallerFn = MF.getFunction(); |
3761 | EHPersonality Pers = |
3762 | CallerFn->hasPersonalityFn() |
3763 | ? classifyEHPersonality(CallerFn->getPersonalityFn()) |
3764 | : EHPersonality::Unknown; |
3765 | if (isFuncletEHPersonality(Pers)) |
3766 | Mask = RegInfo->getNoPreservedMask(); |
3767 | } |
3768 | |
3769 | // Define a new register mask from the existing mask. |
3770 | uint32_t *RegMask = nullptr; |
3771 | |
3772 | // In some calling conventions we need to remove the used physical registers |
3773 | // from the reg mask. |
3774 | if (CallConv == CallingConv::X86_RegCall || HasNCSR) { |
3775 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
3776 | |
3777 | // Allocate a new Reg Mask and copy Mask. |
3778 | RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); |
3779 | unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; |
3780 | memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); |
3781 | |
3782 | // Make sure all sub registers of the argument registers are reset |
3783 | // in the RegMask. |
3784 | for (auto const &RegPair : RegsToPass) |
3785 | for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); |
3786 | SubRegs.isValid(); ++SubRegs) |
3787 | RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); |
3788 | |
3789 | // Create the RegMask Operand according to our updated mask. |
3790 | Ops.push_back(DAG.getRegisterMask(RegMask)); |
3791 | } else { |
3792 | // Create the RegMask Operand according to the static mask. |
3793 | Ops.push_back(DAG.getRegisterMask(Mask)); |
3794 | } |
3795 | |
3796 | if (InFlag.getNode()) |
3797 | Ops.push_back(InFlag); |
3798 | |
3799 | if (isTailCall) { |
3800 | // We used to do: |
3801 | //// If this is the first return lowered for this function, add the regs |
3802 | //// to the liveout set for the function. |
3803 | // This isn't right, although it's probably harmless on x86; liveouts |
3804 | // should be computed from returns not tail calls. Consider a void |
3805 | // function making a tail call to a function returning int. |
3806 | MF.getFrameInfo().setHasTailCall(); |
3807 | return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); |
3808 | } |
3809 | |
3810 | Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); |
3811 | InFlag = Chain.getValue(1); |
3812 | |
3813 | // Create the CALLSEQ_END node. |
3814 | unsigned NumBytesForCalleeToPop; |
3815 | if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, |
3816 | DAG.getTarget().Options.GuaranteedTailCallOpt)) |
3817 | NumBytesForCalleeToPop = NumBytes; // Callee pops everything |
3818 | else if (!Is64Bit && !canGuaranteeTCO(CallConv) && |
3819 | !Subtarget.getTargetTriple().isOSMSVCRT() && |
3820 | SR == StackStructReturn) |
3821 | // If this is a call to a struct-return function, the callee |
3822 | // pops the hidden struct pointer, so we have to push it back. |
3823 | // This is common for Darwin/X86, Linux & Mingw32 targets. |
3824 | // For MSVC Win32 targets, the caller pops the hidden struct pointer. |
3825 | NumBytesForCalleeToPop = 4; |
3826 | else |
3827 | NumBytesForCalleeToPop = 0; // Callee pops nothing. |
3828 | |
3829 | if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { |
3830 | // No need to reset the stack after the call if the call doesn't return. To |
3831 | // make the MI verifier happy, we'll pretend the callee does it for us. |
3832 | NumBytesForCalleeToPop = NumBytes; |
3833 | } |
3834 | |
3835 | // Returns a flag for retval copy to use. |
3836 | if (!IsSibcall) { |
3837 | Chain = DAG.getCALLSEQ_END(Chain, |
3838 | DAG.getIntPtrConstant(NumBytesToPop, dl, true), |
3839 | DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, |
3840 | true), |
3841 | InFlag, dl); |
3842 | InFlag = Chain.getValue(1); |
3843 | } |
3844 | |
3845 | // Handle result values, copying them out of physregs into vregs that we |
3846 | // return. |
3847 | return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, |
3848 | InVals, RegMask); |
3849 | } |
3850 | |
3851 | //===----------------------------------------------------------------------===// |
3852 | // Fast Calling Convention (tail call) implementation |
3853 | //===----------------------------------------------------------------------===// |
3854 | |
3855 | // Like stdcall, the callee cleans up the arguments, except that ECX is |
3856 | // reserved for storing the tail-called function's address. Only 2 registers are |
3857 | // free for argument passing (inreg). Tail call optimization is performed |
3858 | // provided: |
3859 | // * tailcallopt is enabled |
3860 | // * caller/callee are fastcc |
3861 | // On the X86_64 architecture, with GOT-style position-independent code, only |
3862 | // local (within-module) calls are supported at the moment. |
3863 | // To keep the stack aligned according to the platform ABI, the function |
3864 | // GetAlignedArgumentStackSize ensures that the argument delta is always a |
3865 | // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.) |
3866 | // If a tail-called callee has more arguments than the caller, the caller |
3867 | // needs to make sure that there is room to move the RETADDR to. This is |
3868 | // achieved by reserving an area the size of the argument delta right after the |
3869 | // original RETADDR, but before the saved framepointer or the spilled registers |
3870 | // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) |
3871 | // stack layout: |
3872 | // arg1 |
3873 | // arg2 |
3874 | // RETADDR |
3875 | // [ new RETADDR |
3876 | // move area ] |
3877 | // (possible EBP) |
3878 | // ESI |
3879 | // EDI |
3880 | // local1 .. |
3881 | |
3882 | /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment |
3883 | /// requirement. |
3884 | unsigned |
3885 | X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, |
3886 | SelectionDAG& DAG) const { |
3887 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
3888 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); |
3889 | unsigned StackAlignment = TFI.getStackAlignment(); |
3890 | uint64_t AlignMask = StackAlignment - 1; |
3891 | int64_t Offset = StackSize; |
3892 | unsigned SlotSize = RegInfo->getSlotSize(); |
3893 | if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { |
3894 | // The residual fits below (StackAlignment - SlotSize), e.g. 12, so just add the difference. |
3895 | Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); |
3896 | } else { |
3897 | // Mask out the lower bits and add the stack alignment once plus (StackAlignment - SlotSize), e.g. 12 bytes. |
3898 | Offset = ((~AlignMask) & Offset) + StackAlignment + |
3899 | (StackAlignment-SlotSize); |
3900 | } |
3901 | return Offset; |
3902 | } |
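// Worked example (illustrative): with StackAlignment = 16 and SlotSize = 4,
// the result is always congruent to 12 (mod 16). For StackSize = 20,
// 20 & 15 = 4 <= 12, so Offset += (12 - 4) and the function returns 28;
// after the 4-byte return address is pushed the stack is 16-byte aligned.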
3903 | |
3904 | /// Return true if the given stack call argument is already available in the |
3905 | /// same position (relatively) of the caller's incoming argument stack. |
3906 | static |
3907 | bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, |
3908 | MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, |
3909 | const X86InstrInfo *TII, const CCValAssign &VA) { |
3910 | unsigned Bytes = Arg.getValueSizeInBits() / 8; |
3911 | |
3912 | for (;;) { |
3913 | // Look through nodes that don't alter the bits of the incoming value. |
3914 | unsigned Op = Arg.getOpcode(); |
3915 | if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { |
3916 | Arg = Arg.getOperand(0); |
3917 | continue; |
3918 | } |
3919 | if (Op == ISD::TRUNCATE) { |
3920 | const SDValue &TruncInput = Arg.getOperand(0); |
3921 | if (TruncInput.getOpcode() == ISD::AssertZext && |
3922 | cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == |
3923 | Arg.getValueType()) { |
3924 | Arg = TruncInput.getOperand(0); |
3925 | continue; |
3926 | } |
3927 | } |
3928 | break; |
3929 | } |
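// Illustrative example (not from the source): an i8 argument that was
// zero-extended by the caller may arrive here as
//   (truncate (AssertZext i8 (CopyFromReg ...)))
// The loop above strips the truncate/AssertZext pair so that the underlying
// CopyFromReg or load can be matched against a fixed stack object below.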
3930 | |
3931 | int FI = INT_MAX; |
3932 | if (Arg.getOpcode() == ISD::CopyFromReg) { |
3933 | unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); |
3934 | if (!TargetRegisterInfo::isVirtualRegister(VR)) |
3935 | return false; |
3936 | MachineInstr *Def = MRI->getVRegDef(VR); |
3937 | if (!Def) |
3938 | return false; |
3939 | if (!Flags.isByVal()) { |
3940 | if (!TII->isLoadFromStackSlot(*Def, FI)) |
3941 | return false; |
3942 | } else { |
3943 | unsigned Opcode = Def->getOpcode(); |
3944 | if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || |
3945 | Opcode == X86::LEA64_32r) && |
3946 | Def->getOperand(1).isFI()) { |
3947 | FI = Def->getOperand(1).getIndex(); |
3948 | Bytes = Flags.getByValSize(); |
3949 | } else |
3950 | return false; |
3951 | } |
3952 | } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { |
3953 | if (Flags.isByVal()) |
3954 | // ByVal argument is passed in as a pointer but it's now being |
3955 | // dereferenced. e.g. |
3956 | // define @foo(%struct.X* %A) { |
3957 | // tail call @bar(%struct.X* byval %A) |
3958 | // } |
3959 | return false; |
3960 | SDValue Ptr = Ld->getBasePtr(); |
3961 | FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); |
3962 | if (!FINode) |
3963 | return false; |
3964 | FI = FINode->getIndex(); |
3965 | } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { |
3966 | FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); |
3967 | FI = FINode->getIndex(); |
3968 | Bytes = Flags.getByValSize(); |
3969 | } else |
3970 | return false; |
3971 | |
3972 | assert(FI != INT_MAX); |
3973 | if (!MFI.isFixedObjectIndex(FI)) |
3974 | return false; |
3975 | |
3976 | if (Offset != MFI.getObjectOffset(FI)) |
3977 | return false; |
3978 | |
3979 | if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { |
3980 | // If the argument location is wider than the argument type, check that any |
3981 | // extension flags match. |
3982 | if (Flags.isZExt() != MFI.isObjectZExt(FI) || |
3983 | Flags.isSExt() != MFI.isObjectSExt(FI)) { |
3984 | return false; |
3985 | } |
3986 | } |
3987 | |
3988 | return Bytes == MFI.getObjectSize(FI); |
3989 | } |
3990 | |
3991 | /// Check whether the call is eligible for tail call optimization. Targets |
3992 | /// that want to do tail call optimization should implement this function. |
3993 | bool X86TargetLowering::IsEligibleForTailCallOptimization( |
3994 | SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, |
3995 | bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, |
3996 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3997 | const SmallVectorImpl<SDValue> &OutVals, |
3998 | const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { |
3999 | if (!mayTailCallThisCC(CalleeCC)) |
4000 | return false; |
4001 | |
4002 | // If -tailcallopt is specified, make fastcc functions tail-callable. |
4003 | MachineFunction &MF = DAG.getMachineFunction(); |
4004 | const Function *CallerF = MF.getFunction(); |
4005 | |
4006 | // If the function return type is x86_fp80 and the callee return type is not, |
4007 | // then the FP_EXTEND of the call result is not a nop. It's not safe to |
4008 | // perform a tailcall optimization here. |
4009 | if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) |
4010 | return false; |
4011 | |
4012 | CallingConv::ID CallerCC = CallerF->getCallingConv(); |
4013 | bool CCMatch = CallerCC == CalleeCC; |
4014 | bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); |
4015 | bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); |
4016 | |
4017 | // Win64 functions have extra shadow space for argument homing. Don't do the |
4018 | // sibcall if the caller and callee have mismatched expectations for this |
4019 | // space. |
4020 | if (IsCalleeWin64 != IsCallerWin64) |
4021 | return false; |
4022 | |
4023 | if (DAG.getTarget().Options.GuaranteedTailCallOpt) { |
4024 | if (canGuaranteeTCO(CalleeCC) && CCMatch) |
4025 | return true; |
4026 | return false; |
4027 | } |
4028 | |
4029 | // Look for obvious safe cases to perform tail call optimization that do not |
4030 | // require ABI changes. This is what gcc calls sibcall. |
4031 | |
4032 | // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to |
4033 | // emit a special epilogue. |
4034 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
4035 | if (RegInfo->needsStackRealignment(MF)) |
4036 | return false; |
4037 | |
4038 | // Also avoid sibcall optimization if either caller or callee uses struct |
4039 | // return semantics. |
4040 | if (isCalleeStructRet || isCallerStructRet) |
4041 | return false; |
4042 | |
4043 | // Do not sibcall optimize vararg calls unless all arguments are passed via |
4044 | // registers. |
4045 | LLVMContext &C = *DAG.getContext(); |
4046 | if (isVarArg && !Outs.empty()) { |
4047 | // Optimizing for varargs on Win64 is unlikely to be safe without |
4048 | // additional testing. |
4049 | if (IsCalleeWin64 || IsCallerWin64) |
4050 | return false; |
4051 | |
4052 | SmallVector<CCValAssign, 16> ArgLocs; |
4053 | CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
4054 | |
4055 | CCInfo.AnalyzeCallOperands(Outs, CC_X86); |
4056 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) |
4057 | if (!ArgLocs[i].isRegLoc()) |
4058 | return false; |
4059 | } |
4060 | |
4061 | // If the call result is in ST0 / ST1, it needs to be popped off the x87 |
4062 | // stack. Therefore, if it's not used by the call it is not safe to optimize |
4063 | // this into a sibcall. |
4064 | bool Unused = false; |
4065 | for (unsigned i = 0, e = Ins.size(); i != e; ++i) { |
4066 | if (!Ins[i].Used) { |
4067 | Unused = true; |
4068 | break; |
4069 | } |
4070 | } |
4071 | if (Unused) { |
4072 | SmallVector<CCValAssign, 16> RVLocs; |
4073 | CCState CCInfo(CalleeCC, false, MF, RVLocs, C); |
4074 | CCInfo.AnalyzeCallResult(Ins, RetCC_X86); |
4075 | for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { |
4076 | CCValAssign &VA = RVLocs[i]; |
4077 | if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) |
4078 | return false; |
4079 | } |
4080 | } |
4081 | |
4082 | // Check that the call results are passed in the same way. |
4083 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
4084 | RetCC_X86, RetCC_X86)) |
4085 | return false; |
4086 | // The callee has to preserve all registers the caller needs to preserve. |
4087 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
4088 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
4089 | if (!CCMatch) { |
4090 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
4091 | if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) |
4092 | return false; |
4093 | } |
4094 | |
4095 | unsigned StackArgsSize = 0; |
4096 | |
4097 | // If the callee takes no arguments then go on to check the results of the |
4098 | // call. |
4099 | if (!Outs.empty()) { |
4100 | // Check if stack adjustment is needed. For now, do not do this if any |
4101 | // argument is passed on the stack. |
4102 | SmallVector<CCValAssign, 16> ArgLocs; |
4103 | CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); |
4104 | |
4105 | // Allocate shadow area for Win64 |
4106 | if (IsCalleeWin64) |
4107 | CCInfo.AllocateStack(32, 8); |
4108 | |
4109 | CCInfo.AnalyzeCallOperands(Outs, CC_X86); |
4110 | StackArgsSize = CCInfo.getNextStackOffset(); |
4111 | |
4112 | if (CCInfo.getNextStackOffset()) { |
4113 | // Check if the arguments are already laid out in the right way as |
4114 | // the caller's fixed stack objects. |
4115 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
4116 | const MachineRegisterInfo *MRI = &MF.getRegInfo(); |
4117 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
4118 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
4119 | CCValAssign &VA = ArgLocs[i]; |
4120 | SDValue Arg = OutVals[i]; |
4121 | ISD::ArgFlagsTy Flags = Outs[i].Flags; |
4122 | if (VA.getLocInfo() == CCValAssign::Indirect) |
4123 | return false; |
4124 | if (!VA.isRegLoc()) { |
4125 | if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, |
4126 | MFI, MRI, TII, VA)) |
4127 | return false; |
4128 | } |
4129 | } |
4130 | } |
4131 | |
4132 | bool PositionIndependent = isPositionIndependent(); |
4133 | // If the tailcall address may be in a register, then make sure it's |
4134 | // possible to register allocate for it. In 32-bit, the call address can |
4135 | // only target EAX, EDX, or ECX since the tail call must be scheduled after |
4136 | // callee-saved registers are restored. These happen to be the same |
4137 | // registers used to pass 'inreg' arguments so watch out for those. |
4138 | if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && |
4139 | !isa<ExternalSymbolSDNode>(Callee)) || |
4140 | PositionIndependent)) { |
4141 | unsigned NumInRegs = 0; |
4142 | // In PIC we need an extra register to formulate the address computation |
4143 | // for the callee. |
4144 | unsigned MaxInRegs = PositionIndependent ? 2 : 3; |
4145 | |
4146 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
4147 | CCValAssign &VA = ArgLocs[i]; |
4148 | if (!VA.isRegLoc()) |
4149 | continue; |
4150 | unsigned Reg = VA.getLocReg(); |
4151 | switch (Reg) { |
4152 | default: break; |
4153 | case X86::EAX: case X86::EDX: case X86::ECX: |
4154 | if (++NumInRegs == MaxInRegs) |
4155 | return false; |
4156 | break; |
4157 | } |
4158 | } |
4159 | } |
4160 | |
4161 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
4162 | if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) |
4163 | return false; |
4164 | } |
4165 | |
4166 | bool CalleeWillPop = |
4167 | X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, |
4168 | MF.getTarget().Options.GuaranteedTailCallOpt); |
4169 | |
4170 | if (unsigned BytesToPop = |
4171 | MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { |
4172 | // If we have bytes to pop, the callee must pop them. |
4173 | bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; |
4174 | if (!CalleePopMatches) |
4175 | return false; |
4176 | } else if (CalleeWillPop && StackArgsSize > 0) { |
4177 | // If we don't have bytes to pop, make sure the callee doesn't pop any. |
4178 | return false; |
4179 | } |
4180 | |
4181 | return true; |
4182 | } |
4183 | |
4184 | FastISel * |
4185 | X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, |
4186 | const TargetLibraryInfo *libInfo) const { |
4187 | return X86::createFastISel(funcInfo, libInfo); |
4188 | } |
4189 | |
4190 | //===----------------------------------------------------------------------===// |
4191 | // Other Lowering Hooks |
4192 | //===----------------------------------------------------------------------===// |
4193 | |
4194 | static bool MayFoldLoad(SDValue Op) { |
4195 | return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); |
4196 | } |
4197 | |
4198 | static bool MayFoldIntoStore(SDValue Op) { |
4199 | return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); |
4200 | } |
4201 | |
4202 | static bool MayFoldIntoZeroExtend(SDValue Op) { |
4203 | if (Op.hasOneUse()) { |
4204 | unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); |
4205 | return (ISD::ZERO_EXTEND == Opcode); |
4206 | } |
4207 | return false; |
4208 | } |
4209 | |
4210 | static bool isTargetShuffle(unsigned Opcode) { |
4211 | switch(Opcode) { |
4212 | default: return false; |
4213 | case X86ISD::BLENDI: |
4214 | case X86ISD::PSHUFB: |
4215 | case X86ISD::PSHUFD: |
4216 | case X86ISD::PSHUFHW: |
4217 | case X86ISD::PSHUFLW: |
4218 | case X86ISD::SHUFP: |
4219 | case X86ISD::INSERTPS: |
4220 | case X86ISD::PALIGNR: |
4221 | case X86ISD::VSHLDQ: |
4222 | case X86ISD::VSRLDQ: |
4223 | case X86ISD::MOVLHPS: |
4224 | case X86ISD::MOVLHPD: |
4225 | case X86ISD::MOVHLPS: |
4226 | case X86ISD::MOVLPS: |
4227 | case X86ISD::MOVLPD: |
4228 | case X86ISD::MOVSHDUP: |
4229 | case X86ISD::MOVSLDUP: |
4230 | case X86ISD::MOVDDUP: |
4231 | case X86ISD::MOVSS: |
4232 | case X86ISD::MOVSD: |
4233 | case X86ISD::UNPCKL: |
4234 | case X86ISD::UNPCKH: |
4235 | case X86ISD::VBROADCAST: |
4236 | case X86ISD::VPERMILPI: |
4237 | case X86ISD::VPERMILPV: |
4238 | case X86ISD::VPERM2X128: |
4239 | case X86ISD::VPERMIL2: |
4240 | case X86ISD::VPERMI: |
4241 | case X86ISD::VPPERM: |
4242 | case X86ISD::VPERMV: |
4243 | case X86ISD::VPERMV3: |
4244 | case X86ISD::VPERMIV3: |
4245 | case X86ISD::VZEXT_MOVL: |
4246 | return true; |
4247 | } |
4248 | } |
4249 | |
4250 | static bool isTargetShuffleVariableMask(unsigned Opcode) { |
4251 | switch (Opcode) { |
4252 | default: return false; |
4253 | // Target Shuffles. |
4254 | case X86ISD::PSHUFB: |
4255 | case X86ISD::VPERMILPV: |
4256 | case X86ISD::VPERMIL2: |
4257 | case X86ISD::VPPERM: |
4258 | case X86ISD::VPERMV: |
4259 | case X86ISD::VPERMV3: |
4260 | case X86ISD::VPERMIV3: |
4261 | return true; |
4262 | // 'Faux' Target Shuffles. |
4263 | case ISD::AND: |
4264 | case X86ISD::ANDNP: |
4265 | return true; |
4266 | } |
4267 | } |
4268 | |
4269 | SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { |
4270 | MachineFunction &MF = DAG.getMachineFunction(); |
4271 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
4272 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
4273 | int ReturnAddrIndex = FuncInfo->getRAIndex(); |
4274 | |
4275 | if (ReturnAddrIndex == 0) { |
4276 | // Set up a frame object for the return address. |
4277 | unsigned SlotSize = RegInfo->getSlotSize(); |
4278 | ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, |
4279 | -(int64_t)SlotSize, |
4280 | false); |
4281 | FuncInfo->setRAIndex(ReturnAddrIndex); |
4282 | } |
4283 | |
4284 | return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); |
4285 | } |
4286 | |
4287 | bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, |
4288 | bool hasSymbolicDisplacement) { |
4289 | // The offset should fit into a 32-bit immediate field. |
4290 | if (!isInt<32>(Offset)) |
4291 | return false; |
4292 | |
4293 | // If we don't have a symbolic displacement - we don't have any extra |
4294 | // restrictions. |
4295 | if (!hasSymbolicDisplacement) |
4296 | return true; |
4297 | |
4298 | // FIXME: Some tweaks might be needed for medium code model. |
4299 | if (M != CodeModel::Small && M != CodeModel::Kernel) |
4300 | return false; |
4301 | |
4302 | // For the small code model we assume that the latest object is 16MB before |
4303 | // the end of the 31-bit address boundary. We may also accept pretty large |
4304 | // negative constants, knowing that all objects are in the positive half of the address space. |
4305 | if (M == CodeModel::Small && Offset < 16*1024*1024) |
4306 | return true; |
4307 | |
4308 | // For the kernel code model we know that all objects reside in the negative |
4309 | // half of the 32-bit address space. We may not accept negative offsets, since |
4310 | // they may take us just past the boundary, but we may accept pretty large positive ones. |
4311 | if (M == CodeModel::Kernel && Offset >= 0) |
4312 | return true; |
4313 | |
4314 | return false; |
4315 | } |
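// Usage sketch (illustrative): under the small code model, a symbolic
// displacement with Offset = 8MB (< 16MB) is accepted while 32MB is
// rejected; under the kernel code model Offset = +1MB is accepted but any
// negative offset is not, since it could slip past the boundary.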
4316 | |
4317 | /// Determines whether the callee is required to pop its own arguments. |
4318 | /// Callee pop is necessary to support tail calls. |
4319 | bool X86::isCalleePop(CallingConv::ID CallingConv, |
4320 | bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { |
4321 | // If GuaranteeTCO is true, we force some calls to be callee pop so that we |
4322 | // can guarantee TCO. |
4323 | if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) |
4324 | return true; |
4325 | |
4326 | switch (CallingConv) { |
4327 | default: |
4328 | return false; |
4329 | case CallingConv::X86_StdCall: |
4330 | case CallingConv::X86_FastCall: |
4331 | case CallingConv::X86_ThisCall: |
4332 | case CallingConv::X86_VectorCall: |
4333 | return !is64Bit; |
4334 | } |
4335 | } |
4336 | |
4337 | /// \brief Return true if the condition is an unsigned comparison operation. |
4338 | static bool isX86CCUnsigned(unsigned X86CC) { |
4339 | switch (X86CC) { |
4340 | default: |
4341 | llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 4341); |
4342 | case X86::COND_E: |
4343 | case X86::COND_NE: |
4344 | case X86::COND_B: |
4345 | case X86::COND_A: |
4346 | case X86::COND_BE: |
4347 | case X86::COND_AE: |
4348 | return true; |
4349 | case X86::COND_G: |
4350 | case X86::COND_GE: |
4351 | case X86::COND_L: |
4352 | case X86::COND_LE: |
4353 | return false; |
4354 | } |
4355 | } |
4356 | |
4357 | static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { |
4358 | switch (SetCCOpcode) { |
4359 | default: llvm_unreachable("Invalid integer condition!"); |
4360 | case ISD::SETEQ: return X86::COND_E; |
4361 | case ISD::SETGT: return X86::COND_G; |
4362 | case ISD::SETGE: return X86::COND_GE; |
4363 | case ISD::SETLT: return X86::COND_L; |
4364 | case ISD::SETLE: return X86::COND_LE; |
4365 | case ISD::SETNE: return X86::COND_NE; |
4366 | case ISD::SETULT: return X86::COND_B; |
4367 | case ISD::SETUGT: return X86::COND_A; |
4368 | case ISD::SETULE: return X86::COND_BE; |
4369 | case ISD::SETUGE: return X86::COND_AE; |
4370 | } |
4371 | } |
4372 | |
4373 | /// Do a one-to-one translation of a ISD::CondCode to the X86-specific |
4374 | /// condition code, returning the condition code and the LHS/RHS of the |
4375 | /// comparison to make. |
4376 | static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, |
4377 | bool isFP, SDValue &LHS, SDValue &RHS, |
4378 | SelectionDAG &DAG) { |
4379 | if (!isFP) { |
4380 | if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { |
4381 | if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { |
4382 | // X > -1 -> X == 0, jump !sign. |
4383 | RHS = DAG.getConstant(0, DL, RHS.getValueType()); |
4384 | return X86::COND_NS; |
4385 | } |
4386 | if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { |
4387 | // X < 0 -> X == 0, jump on sign. |
4388 | return X86::COND_S; |
4389 | } |
4390 | if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { |
4391 | // X < 1 -> X <= 0 |
4392 | RHS = DAG.getConstant(0, DL, RHS.getValueType()); |
4393 | return X86::COND_LE; |
4394 | } |
4395 | } |
4396 | |
4397 | return TranslateIntegerX86CC(SetCCOpcode); |
4398 | } |
4399 | |
4400 | // First determine if it is required or is profitable to flip the operands. |
4401 | |
4402 | // If LHS is a foldable load, but RHS is not, flip the condition. |
4403 | if (ISD::isNON_EXTLoad(LHS.getNode()) && |
4404 | !ISD::isNON_EXTLoad(RHS.getNode())) { |
4405 | SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); |
4406 | std::swap(LHS, RHS); |
4407 | } |
4408 | |
4409 | switch (SetCCOpcode) { |
4410 | default: break; |
4411 | case ISD::SETOLT: |
4412 | case ISD::SETOLE: |
4413 | case ISD::SETUGT: |
4414 | case ISD::SETUGE: |
4415 | std::swap(LHS, RHS); |
4416 | break; |
4417 | } |
4418 | |
4419 | // On a floating point condition, the flags are set as follows: |
4420 | // ZF PF CF op |
4421 | // 0 | 0 | 0 | X > Y |
4422 | // 0 | 0 | 1 | X < Y |
4423 | // 1 | 0 | 0 | X == Y |
4424 | // 1 | 1 | 1 | unordered |
4425 | switch (SetCCOpcode) { |
4426 | default: llvm_unreachable("Condcode should be pre-legalized away"); |
4427 | case ISD::SETUEQ: |
4428 | case ISD::SETEQ: return X86::COND_E; |
4429 | case ISD::SETOLT: // flipped |
4430 | case ISD::SETOGT: |
4431 | case ISD::SETGT: return X86::COND_A; |
4432 | case ISD::SETOLE: // flipped |
4433 | case ISD::SETOGE: |
4434 | case ISD::SETGE: return X86::COND_AE; |
4435 | case ISD::SETUGT: // flipped |
4436 | case ISD::SETULT: |
4437 | case ISD::SETLT: return X86::COND_B; |
4438 | case ISD::SETUGE: // flipped |
4439 | case ISD::SETULE: |
4440 | case ISD::SETLE: return X86::COND_BE; |
4441 | case ISD::SETONE: |
4442 | case ISD::SETNE: return X86::COND_NE; |
4443 | case ISD::SETUO: return X86::COND_P; |
4444 | case ISD::SETO: return X86::COND_NP; |
4445 | case ISD::SETOEQ: |
4446 | case ISD::SETUNE: return X86::COND_INVALID; |
4447 | } |
4448 | } |
4449 | |
4450 | /// Is there a floating point cmov for the specific X86 condition code? |
4451 | /// Current x86 isa includes the following FP cmov instructions: |
4452 | /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. |
4453 | static bool hasFPCMov(unsigned X86CC) { |
4454 | switch (X86CC) { |
4455 | default: |
4456 | return false; |
4457 | case X86::COND_B: |
4458 | case X86::COND_BE: |
4459 | case X86::COND_E: |
4460 | case X86::COND_P: |
4461 | case X86::COND_A: |
4462 | case X86::COND_AE: |
4463 | case X86::COND_NE: |
4464 | case X86::COND_NP: |
4465 | return true; |
4466 | } |
4467 | } |
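// Note (an ISA-level observation, not from the source): FCMOVcc can only
// test CF, ZF and PF, so exactly the unsigned/parity conditions above are
// representable; a signed condition such as COND_G would need SF/OF and
// therefore has no FP cmov form.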
4468 | |
4469 | |
4470 | bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
4471 | const CallInst &I, |
4472 | unsigned Intrinsic) const { |
4473 | |
4474 | const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); |
4475 | if (!IntrData) |
4476 | return false; |
4477 | |
4478 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4479 | Info.readMem = false; |
4480 | Info.writeMem = false; |
4481 | Info.vol = false; |
4482 | Info.offset = 0; |
4483 | |
4484 | switch (IntrData->Type) { |
4485 | case EXPAND_FROM_MEM: { |
4486 | Info.ptrVal = I.getArgOperand(0); |
4487 | Info.memVT = MVT::getVT(I.getType()); |
4488 | Info.align = 1; |
4489 | Info.readMem = true; |
4490 | break; |
4491 | } |
4492 | case COMPRESS_TO_MEM: { |
4493 | Info.ptrVal = I.getArgOperand(0); |
4494 | Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); |
4495 | Info.align = 1; |
4496 | Info.writeMem = true; |
4497 | break; |
4498 | } |
4499 | case TRUNCATE_TO_MEM_VI8: |
4500 | case TRUNCATE_TO_MEM_VI16: |
4501 | case TRUNCATE_TO_MEM_VI32: { |
4502 | Info.ptrVal = I.getArgOperand(0); |
4503 | MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); |
4504 | MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; |
4505 | if (IntrData->Type == TRUNCATE_TO_MEM_VI8) |
4506 | ScalarVT = MVT::i8; |
4507 | else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) |
4508 | ScalarVT = MVT::i16; |
4509 | else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) |
4510 | ScalarVT = MVT::i32; |
4511 | |
4512 | Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); |
4513 | Info.align = 1; |
4514 | Info.writeMem = true; |
4515 | break; |
4516 | } |
4517 | default: |
4518 | return false; |
4519 | } |
4520 | |
4521 | return true; |
4522 | } |
4523 | |
4524 | /// Returns true if the target can instruction select the |
4525 | /// specified FP immediate natively. If false, the legalizer will |
4526 | /// materialize the FP immediate as a load from a constant pool. |
4527 | bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { |
4528 | for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { |
4529 | if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) |
4530 | return true; |
4531 | } |
4532 | return false; |
4533 | } |
4534 | |
4535 | bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, |
4536 | ISD::LoadExtType ExtTy, |
4537 | EVT NewVT) const { |
4538 | // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF |
4539 | // relocation target a movq or addq instruction: don't let the load shrink. |
4540 | SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); |
4541 | if (BasePtr.getOpcode() == X86ISD::WrapperRIP) |
4542 | if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) |
4543 | return GA->getTargetFlags() != X86II::MO_GOTTPOFF; |
4544 | return true; |
4545 | } |
4546 | |
4547 | /// \brief Returns true if it is beneficial to convert a load of a constant |
4548 | /// to just the constant itself. |
4549 | bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, |
4550 | Type *Ty) const { |
4551 | assert(Ty->isIntegerTy());
4552 | |
4553 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
4554 | if (BitSize == 0 || BitSize > 64) |
4555 | return false; |
4556 | return true; |
4557 | } |
4558 | |
4559 | bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, |
4560 | unsigned Index) const { |
4561 | if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) |
4562 | return false; |
4563 | |
4564 | return (Index == 0 || Index == ResVT.getVectorNumElements()); |
4565 | } |
4566 | |
4567 | bool X86TargetLowering::isCheapToSpeculateCttz() const { |
4568 | // Speculate cttz only if we can directly use TZCNT. |
4569 | return Subtarget.hasBMI(); |
4570 | } |
4571 | |
4572 | bool X86TargetLowering::isCheapToSpeculateCtlz() const { |
4573 | // Speculate ctlz only if we can directly use LZCNT. |
4574 | return Subtarget.hasLZCNT(); |
4575 | } |
4576 | |
4577 | bool X86TargetLowering::isCtlzFast() const { |
4578 | return Subtarget.hasFastLZCNT(); |
4579 | } |
4580 | |
4581 | bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( |
4582 | const Instruction &AndI) const { |
4583 | return true; |
4584 | } |
4585 | |
4586 | bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { |
4587 | if (!Subtarget.hasBMI()) |
4588 | return false; |
4589 | |
4590 | // There are only 32-bit and 64-bit forms for 'andn'. |
4591 | EVT VT = Y.getValueType(); |
4592 | if (VT != MVT::i32 && VT != MVT::i64) |
4593 | return false; |
4594 | |
4595 | return true; |
4596 | } |
4597 | |
4598 | MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { |
4599 | MVT VT = MVT::getIntegerVT(NumBits); |
4600 | if (isTypeLegal(VT)) |
4601 | return VT; |
4602 | |
4603 | // PMOVMSKB can handle this. |
4604 | if (NumBits == 128 && isTypeLegal(MVT::v16i8)) |
4605 | return MVT::v16i8; |
4606 | |
4607 | // VPMOVMSKB can handle this. |
4608 | if (NumBits == 256 && isTypeLegal(MVT::v32i8)) |
4609 | return MVT::v32i8; |
4610 | |
4611 | // TODO: Allow 64-bit type for 32-bit target. |
4612 | // TODO: 512-bit types should be allowed, but make sure that those |
4613 | // cases are handled in combineVectorSizedSetCCEquality(). |
4614 | |
4615 | return MVT::INVALID_SIMPLE_VALUE_TYPE; |
4616 | } |
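// Worked example of the cases above: a 16-byte equality test (e.g. from a
// memcmp(a, b, 16) == 0 expansion, see combineVectorSizedSetCCEquality)
// can use a single v16i8 PCMPEQB plus PMOVMSKB and one scalar compare of
// the resulting 16-bit mask against 0xFFFF, rather than two i64 compares.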
4617 | |
4618 | /// Val is the undef sentinel value or equal to the specified value. |
4619 | static bool isUndefOrEqual(int Val, int CmpVal) { |
4620 | return ((Val == SM_SentinelUndef) || (Val == CmpVal)); |
4621 | } |
4622 | |
4623 | /// Val is either the undef or zero sentinel value. |
4624 | static bool isUndefOrZero(int Val) { |
4625 | return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); |
4626 | } |
4627 | |
4628 | /// Return true if every element in Mask, beginning |
4629 | /// from position Pos and ending at Pos+Size, is the undef sentinel value.
4630 | static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { |
4631 | for (unsigned i = Pos, e = Pos + Size; i != e; ++i) |
4632 | if (Mask[i] != SM_SentinelUndef) |
4633 | return false; |
4634 | return true; |
4635 | } |
4636 | |
4637 | /// Return true if Val is undef or if its value falls within the |
4638 | /// specified range [Low, Hi).
4639 | static bool isUndefOrInRange(int Val, int Low, int Hi) { |
4640 | return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi); |
4641 | } |
4642 | |
4643 | /// Return true if every element in Mask is undef or if its value |
4644 | /// falls within the specified range [Low, Hi).
4645 | static bool isUndefOrInRange(ArrayRef<int> Mask, |
4646 | int Low, int Hi) { |
4647 | for (int M : Mask) |
4648 | if (!isUndefOrInRange(M, Low, Hi)) |
4649 | return false; |
4650 | return true; |
4651 | } |
4652 | |
4653 | /// Return true if Val is undef, zero or if its value falls within the |
4654 | /// specified range [Low, Hi).
4655 | static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { |
4656 | return isUndefOrZero(Val) || (Val >= Low && Val < Hi); |
4657 | } |
4658 | |
4659 | /// Return true if every element in Mask is undef, zero or if its value |
4660 | /// falls within the specified range [Low, Hi).
4661 | static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { |
4662 | for (int M : Mask) |
4663 | if (!isUndefOrZeroOrInRange(M, Low, Hi)) |
4664 | return false; |
4665 | return true; |
4666 | } |
4667 | |
4668 | /// Return true if every element in Mask, beginning |
4669 | /// from position Pos and ending at Pos+Size, falls within the specified
4670 | /// sequential range [Low, Low+Size), or is undef.
4671 | static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, |
4672 | unsigned Pos, unsigned Size, int Low) { |
4673 | for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) |
4674 | if (!isUndefOrEqual(Mask[i], Low)) |
4675 | return false; |
4676 | return true; |
4677 | } |
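// e.g. for Mask = <8, -1, 10, 3>, isSequentialOrUndefInRange(Mask, 0, 3, 8)
// is true (8, undef, 10 match the sequence 8, 9, 10), but checking all four
// elements with Size = 4 fails because Mask[3] == 3 rather than 11.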
4678 | |
4679 | /// Return true if every element in Mask, beginning |
4680 | /// from position Pos and ending at Pos+Size, falls within the specified
4681 | /// sequential range [Low, Low+Size), or is undef or zero.
4682 | static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, |
4683 | unsigned Size, int Low) { |
4684 | for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) |
4685 | if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) |
4686 | return false; |
4687 | return true; |
4688 | } |
4689 | |
4690 | /// Return true if every element in Mask, beginning |
4691 | /// from position Pos and ending at Pos+Size, is undef or zero.
4692 | static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, |
4693 | unsigned Size) { |
4694 | for (unsigned i = Pos, e = Pos + Size; i != e; ++i) |
4695 | if (!isUndefOrZero(Mask[i])) |
4696 | return false; |
4697 | return true; |
4698 | } |
4699 | |
4700 | /// \brief Helper function to test whether a shuffle mask could be |
4701 | /// simplified by widening the elements being shuffled. |
4702 | /// |
4703 | /// Appends the mask for wider elements in WidenedMask if valid. Otherwise |
4704 | /// leaves it in an unspecified state. |
4705 | /// |
4706 | /// NOTE: This must handle normal vector shuffle masks and *target* vector |
4707 | /// shuffle masks. The latter have the special property of a '-2' representing |
4708 | /// a zeroed lane of a vector.
4709 | static bool canWidenShuffleElements(ArrayRef<int> Mask, |
4710 | SmallVectorImpl<int> &WidenedMask) { |
4711 | WidenedMask.assign(Mask.size() / 2, 0); |
4712 | for (int i = 0, Size = Mask.size(); i < Size; i += 2) { |
4713 | int M0 = Mask[i]; |
4714 | int M1 = Mask[i + 1]; |
4715 | |
4716 | // If both elements are undef, it's trivial.
4717 | if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { |
4718 | WidenedMask[i / 2] = SM_SentinelUndef; |
4719 | continue; |
4720 | } |
4721 | |
4722 | // Check for an undef mask and a mask value properly aligned to fit with |
4723 | // a pair of values. If we find such a case, use the non-undef mask's value. |
4724 | if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { |
4725 | WidenedMask[i / 2] = M1 / 2; |
4726 | continue; |
4727 | } |
4728 | if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { |
4729 | WidenedMask[i / 2] = M0 / 2; |
4730 | continue; |
4731 | } |
4732 | |
4733 | // When zeroing, we need to spread the zeroing across both lanes to widen. |
4734 | if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { |
4735 | if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && |
4736 | (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { |
4737 | WidenedMask[i / 2] = SM_SentinelZero; |
4738 | continue; |
4739 | } |
4740 | return false; |
4741 | } |
4742 | |
4743 | // Finally check if the two mask values are adjacent and aligned with |
4744 | // a pair. |
4745 | if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { |
4746 | WidenedMask[i / 2] = M0 / 2; |
4747 | continue; |
4748 | } |
4749 | |
4750 | // Otherwise we can't safely widen the elements used in this shuffle. |
4751 | return false; |
4752 | } |
4753 | assert(WidenedMask.size() == Mask.size() / 2 &&
4754 |        "Incorrect size of mask after widening the elements!");
4755 | |
4756 | return true; |
4757 | } |
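// e.g. the v8 mask <0, 1, -1, 3, SM_SentinelZero, SM_SentinelZero, 6, 7>
// widens to the v4 mask <0, 1, SM_SentinelZero, 3>, whereas a pair such as
// (1, 2) cannot be widened because it does not begin at an even source index.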
4758 | |
4759 | /// Helper function to scale a shuffle or target shuffle mask, replacing each |
4760 | /// mask index with the scaled sequential indices for an equivalent narrowed |
4761 | /// mask. This is the reverse process to canWidenShuffleElements, but can always |
4762 | /// succeed. |
4763 | static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, |
4764 | SmallVectorImpl<int> &ScaledMask) { |
4765 | assert(0 < Scale && "Unexpected scaling factor");
4766 | int NumElts = Mask.size(); |
4767 | ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); |
4768 | |
4769 | for (int i = 0; i != NumElts; ++i) { |
4770 | int M = Mask[i]; |
4771 | |
4772 | // Repeat sentinel values in every mask element. |
4773 | if (M < 0) { |
4774 | for (int s = 0; s != Scale; ++s) |
4775 | ScaledMask[(Scale * i) + s] = M; |
4776 | continue; |
4777 | } |
4778 | |
4779 | // Scale mask element and increment across each mask element. |
4780 | for (int s = 0; s != Scale; ++s) |
4781 | ScaledMask[(Scale * i) + s] = (Scale * M) + s; |
4782 | } |
4783 | } |
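// e.g. scaleShuffleMask(2, <1, -1>, ScaledMask) yields <2, 3, -1, -1>:
// each sentinel is repeated Scale times, and index M becomes the run
// Scale*M .. Scale*M + Scale-1.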
4784 | |
4785 | /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector |
4786 | /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4787 | static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { |
4788 | assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4789 | if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) |
4790 | return false; |
4791 | |
4792 | // The index should be aligned on a vecWidth-bit boundary. |
4793 | uint64_t Index = N->getConstantOperandVal(1); |
4794 | MVT VT = N->getSimpleValueType(0); |
4795 | unsigned ElSize = VT.getScalarSizeInBits(); |
4796 | return (Index * ElSize) % vecWidth == 0; |
4797 | } |
4798 | |
4799 | /// Return true if the specified INSERT_SUBVECTOR |
4800 | /// operand specifies a subvector insert that is suitable for input to |
4801 | /// insertion of 128- or 256-bit subvectors.
4802 | static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { |
4803 | assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4804 | if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) |
4805 | return false; |
4806 | |
4807 | // The index should be aligned on a vecWidth-bit boundary. |
4808 | uint64_t Index = N->getConstantOperandVal(2); |
4809 | MVT VT = N->getSimpleValueType(0); |
4810 | unsigned ElSize = VT.getScalarSizeInBits(); |
4811 | return (Index * ElSize) % vecWidth == 0; |
4812 | } |
4813 | |
4814 | bool X86::isVINSERT128Index(SDNode *N) { |
4815 | return isVINSERTIndex(N, 128); |
4816 | } |
4817 | |
4818 | bool X86::isVINSERT256Index(SDNode *N) { |
4819 | return isVINSERTIndex(N, 256); |
4820 | } |
4821 | |
4822 | bool X86::isVEXTRACT128Index(SDNode *N) { |
4823 | return isVEXTRACTIndex(N, 128); |
4824 | } |
4825 | |
4826 | bool X86::isVEXTRACT256Index(SDNode *N) { |
4827 | return isVEXTRACTIndex(N, 256); |
4828 | } |
4829 | |
4830 | static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { |
4831 | assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4832 | assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4833 |        "Illegal extract subvector for VEXTRACT");
4834 | |
4835 | uint64_t Index = N->getConstantOperandVal(1); |
4836 | MVT VecVT = N->getOperand(0).getSimpleValueType(); |
4837 | unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); |
4838 | return Index / NumElemsPerChunk; |
4839 | } |
4840 | |
4841 | static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { |
4842 | assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4843 | assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4844 |        "Illegal insert subvector for VINSERT");
4845 | |
4846 | uint64_t Index = N->getConstantOperandVal(2); |
4847 | MVT VecVT = N->getSimpleValueType(0); |
4848 | unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); |
4849 | return Index / NumElemsPerChunk; |
4850 | } |
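// e.g. for an EXTRACT_SUBVECTOR taking the upper half of a v8i32 (index 4)
// with vecWidth == 128: NumElemsPerChunk == 128 / 32 == 4, so the VEXTRACT
// immediate is 4 / 4 == 1. The VINSERT immediate is derived the same way
// from the INSERT_SUBVECTOR index operand.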
4851 | |
4852 | /// Return the appropriate immediate to extract the specified |
4853 | /// EXTRACT_SUBVECTOR index with VEXTRACTF128 / VEXTRACTI128 instructions.
4854 | unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { |
4855 | return getExtractVEXTRACTImmediate(N, 128); |
4856 | } |
4857 | |
4858 | /// Return the appropriate immediate to extract the specified |
4859 | /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 / VEXTRACTI64x4 instructions.
4860 | unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { |
4861 | return getExtractVEXTRACTImmediate(N, 256); |
4862 | } |
4863 | |
4864 | /// Return the appropriate immediate to insert at the specified |
4865 | /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. |
4866 | unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { |
4867 | return getInsertVINSERTImmediate(N, 128); |
4868 | } |
4869 | |
4870 | /// Return the appropriate immediate to insert at the specified |
4871 | /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4872 | unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { |
4873 | return getInsertVINSERTImmediate(N, 256); |
4874 | } |
4875 | |
4876 | /// Returns true if Elt is a constant zero or a floating point constant +0.0. |
4877 | bool X86::isZeroNode(SDValue Elt) { |
4878 | return isNullConstant(Elt) || isNullFPConstant(Elt); |
4879 | } |
4880 | |
4881 | // Build a vector of constants. |
4882 | // Use an UNDEF node if MaskElt == -1. |
4883 | // Split 64-bit constants into 32-bit pairs in 32-bit mode.
4884 | static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, |
4885 | const SDLoc &dl, bool IsMask = false) { |
4886 | |
4887 | SmallVector<SDValue, 32> Ops; |
4888 | bool Split = false; |
4889 | |
4890 | MVT ConstVecVT = VT; |
4891 | unsigned NumElts = VT.getVectorNumElements(); |
4892 | bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); |
4893 | if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { |
4894 | ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); |
4895 | Split = true; |
4896 | } |
4897 | |
4898 | MVT EltVT = ConstVecVT.getVectorElementType(); |
4899 | for (unsigned i = 0; i < NumElts; ++i) { |
4900 | bool IsUndef = Values[i] < 0 && IsMask; |
4901 | SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : |
4902 | DAG.getConstant(Values[i], dl, EltVT); |
4903 | Ops.push_back(OpNode); |
4904 | if (Split) |
4905 | Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : |
4906 | DAG.getConstant(0, dl, EltVT)); |
4907 | } |
4908 | SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); |
4909 | if (Split) |
4910 | ConstsNode = DAG.getBitcast(VT, ConstsNode); |
4911 | return ConstsNode; |
4912 | } |
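// e.g. on a 32-bit target (where i64 is not legal), the v2i64 constant
// <1, 2> is built as the v4i32 vector <1, 0, 2, 0> (little-endian lane
// order) and bitcast back to v2i64; with IsMask set, negative entries
// become UNDEF lanes instead.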
4913 | |
4914 | static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs, |
4915 | MVT VT, SelectionDAG &DAG, const SDLoc &dl) { |
4916 | assert(Bits.size() == Undefs.getBitWidth() &&
4917 |        "Unequal constant and undef arrays");
4918 | SmallVector<SDValue, 32> Ops; |
4919 | bool Split = false; |
4920 | |
4921 | MVT ConstVecVT = VT; |
4922 | unsigned NumElts = VT.getVectorNumElements(); |
4923 | bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); |
4924 | if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { |
4925 | ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); |
4926 | Split = true; |
4927 | } |
4928 | |
4929 | MVT EltVT = ConstVecVT.getVectorElementType(); |
4930 | for (unsigned i = 0, e = Bits.size(); i != e; ++i) { |
4931 | if (Undefs[i]) { |
4932 | Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); |
4933 | continue; |
4934 | } |
4935 | const APInt &V = Bits[i]; |
4936 | assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4937 | if (Split) { |
4938 | Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); |
4939 | Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); |
4940 | } else if (EltVT == MVT::f32) { |
4941 | APFloat FV(APFloat::IEEEsingle(), V); |
4942 | Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); |
4943 | } else if (EltVT == MVT::f64) { |
4944 | APFloat FV(APFloat::IEEEdouble(), V); |
4945 | Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); |
4946 | } else { |
4947 | Ops.push_back(DAG.getConstant(V, dl, EltVT)); |
4948 | } |
4949 | } |
4950 | |
4951 | SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); |
4952 | return DAG.getBitcast(VT, ConstsNode); |
4953 | } |
4954 | |
4955 | /// Returns a vector of specified type with all zero elements. |
4956 | static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, |
4957 | SelectionDAG &DAG, const SDLoc &dl) { |
4958 | assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4959 |         VT.getVectorElementType() == MVT::i1) &&
4960 |        "Unexpected vector type");
4961 | |
4962 | // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest |
4963 | // type. This ensures they get CSE'd. But if the integer type is not |
4964 | // available, use a floating-point +0.0 instead. |
4965 | SDValue Vec; |
4966 | if (!Subtarget.hasSSE2() && VT.is128BitVector()) { |
4967 | Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); |
4968 | } else if (VT.getVectorElementType() == MVT::i1) { |
4969 | assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4970 |        "Unexpected vector type");
4971 | assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4972 |        "Unexpected vector type");
4973 | Vec = DAG.getConstant(0, dl, VT); |
4974 | } else { |
4975 | unsigned Num32BitElts = VT.getSizeInBits() / 32; |
4976 | Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); |
4977 | } |
4978 | return DAG.getBitcast(VT, Vec); |
4979 | } |
4980 | |
4981 | static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, |
4982 | const SDLoc &dl, unsigned vectorWidth) { |
4983 | EVT VT = Vec.getValueType(); |
4984 | EVT ElVT = VT.getVectorElementType(); |
4985 | unsigned Factor = VT.getSizeInBits()/vectorWidth; |
4986 | EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, |
4987 | VT.getVectorNumElements()/Factor); |
4988 | |
4989 | // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR |
4990 | unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); |
4991 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4992 | |
4993 | // This is the index of the first element of the vectorWidth-bit chunk |
4994 | // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. |
4995 | IdxVal &= ~(ElemsPerChunk - 1); |
4996 | |
4997 | // If the input is a buildvector just emit a smaller one. |
4998 | if (Vec.getOpcode() == ISD::BUILD_VECTOR) |
4999 | return DAG.getBuildVector( |
5000 | ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); |
5001 | |
5002 | SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); |
5003 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); |
5004 | } |
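// e.g. extracting 128 bits from a v8f32 with IdxVal == 5: ElemsPerChunk is
// 4, so IdxVal is rounded down to 4 and the upper half <4, 5, 6, 7> is
// returned; the caller can then address element 5 as lane 1 of the result.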
5005 | |
5006 | /// Generate a DAG to grab 128-bits from a vector > 128 bits. This |
5007 | /// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
5008 | /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 |
5009 | /// instructions or a simple subregister reference. Idx is an index in the |
5010 | /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes |
5011 | /// lowering EXTRACT_VECTOR_ELT operations easier. |
5012 | static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, |
5013 | SelectionDAG &DAG, const SDLoc &dl) { |
5014 | assert((Vec.getValueType().is256BitVector() ||
5015 |         Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5016 | return extractSubVector(Vec, IdxVal, DAG, dl, 128); |
5017 | } |
5018 | |
5019 | /// Generate a DAG to grab 256-bits from a 512-bit vector. |
5020 | static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, |
5021 | SelectionDAG &DAG, const SDLoc &dl) { |
5022 | assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5023 | return extractSubVector(Vec, IdxVal, DAG, dl, 256); |
5024 | } |
5025 | |
5026 | static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, |
5027 | SelectionDAG &DAG, const SDLoc &dl, |
5028 | unsigned vectorWidth) { |
5029 | assert((vectorWidth == 128 || vectorWidth == 256) &&
5030 |        "Unsupported vector width");
5031 | // Inserting an UNDEF subvector leaves Result unchanged.
5032 | if (Vec.isUndef()) |
5033 | return Result; |
5034 | EVT VT = Vec.getValueType(); |
5035 | EVT ElVT = VT.getVectorElementType(); |
5036 | EVT ResultVT = Result.getValueType(); |
5037 | |
5038 | // Insert the relevant vectorWidth bits. |
5039 | unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); |
5040 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5041 | |
5042 | // This is the index of the first element of the vectorWidth-bit chunk |
5043 | // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. |
5044 | IdxVal &= ~(ElemsPerChunk - 1); |
5045 | |
5046 | SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); |
5047 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); |
5048 | } |
5049 | |
5050 | /// Generate a DAG to put 128-bits into a vector > 128 bits. This |
5051 | /// sets things up to match an AVX VINSERTF128/VINSERTI128 or
5052 | /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a |
5053 | /// simple superregister reference. Idx is an index in the 128 bits |
5054 | /// we want. It need not be aligned to a 128-bit boundary. That makes |
5055 | /// lowering INSERT_VECTOR_ELT operations easier. |
5056 | static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, |
5057 | SelectionDAG &DAG, const SDLoc &dl) { |
5058 | assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5059 | return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); |
5060 | } |
5061 | |
5062 | static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, |
5063 | SelectionDAG &DAG, const SDLoc &dl) { |
5064 | assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5065 | return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); |
5066 | } |
5067 | |
5068 | // Return true if the instruction zeroes the unused upper part of the |
5069 | // destination and accepts a mask.
5070 | static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { |
5071 | switch (Opcode) { |
5072 | default: |
5073 | return false; |
5074 | case X86ISD::PCMPEQM: |
5075 | case X86ISD::PCMPGTM: |
5076 | case X86ISD::CMPM: |
5077 | case X86ISD::CMPMU: |
5078 | return true; |
5079 | } |
5080 | } |
5081 | |
5082 | /// Insert i1-subvector to i1-vector. |
5083 | static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, |
5084 | const X86Subtarget &Subtarget) { |
5085 | |
5086 | SDLoc dl(Op); |
5087 | SDValue Vec = Op.getOperand(0); |
5088 | SDValue SubVec = Op.getOperand(1); |
5089 | SDValue Idx = Op.getOperand(2); |
5090 | |
5091 | if (!isa<ConstantSDNode>(Idx)) |
5092 | return SDValue(); |
5093 | |
5094 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
5095 | if (IdxVal == 0 && Vec.isUndef()) // the operation is legal |
5096 | return Op; |
5097 | |
5098 | MVT OpVT = Op.getSimpleValueType(); |
5099 | MVT SubVecVT = SubVec.getSimpleValueType(); |
5100 | unsigned NumElems = OpVT.getVectorNumElements(); |
5101 | unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); |
5102 | |
5103 | assert(IdxVal + SubVecNumElems <= NumElems &&
5104 |        IdxVal % SubVecVT.getSizeInBits() == 0 &&
5105 |        "Unexpected index value in INSERT_SUBVECTOR");
5106 | |
5107 | // There are 3 possible cases: |
5108 | // 1. Subvector should be inserted in the lower part (IdxVal == 0) |
5109 | // 2. Subvector should be inserted in the upper part |
5110 | // (IdxVal + SubVecNumElems == NumElems) |
5111 | // 3. Subvector should be inserted in the middle (for example v2i1 |
5112 | // to v16i1, index 2) |
5113 | |
5114 | // If this node widens - by concatenating zeroes - the type of the result |
5115 | // of a node with an instruction that zeroes all upper (irrelevant) bits of the
5116 | // output register, mark this node as legal so it can be replaced with
5117 | // the v8i1 version of the previous instruction during instruction selection.
5118 | // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
5119 | // k-register while zeroing the remaining upper 60 bits of that register. If
5120 | // the result of such an instruction is inserted into an all-zeros vector, we
5121 | // can safely remove the insert_subvector (in instruction selection), as the
5122 | // compare instruction has already zeroed the rest of the register.
5123 | if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 && |
5124 | (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) || |
5125 | (SubVec.getOpcode() == ISD::AND && |
5126 | (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) || |
5127 | isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode()))))) |
5128 | return Op; |
5129 | |
5130 | // Extend to a natively supported kshift type.
5131 | MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; |
5132 | MVT WideOpVT = OpVT; |
5133 | if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits()) |
5134 | WideOpVT = MinVT; |
5135 | |
5136 | SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); |
5137 | SDValue Undef = DAG.getUNDEF(WideOpVT); |
5138 | SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, |
5139 | Undef, SubVec, ZeroIdx); |
5140 | |
5141 | // Extract the sub-vector if required.
5142 | auto ExtractSubVec = [&](SDValue V) { |
5143 | return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, |
5144 | OpVT, V, ZeroIdx); |
5145 | }; |
5146 | |
5147 | if (Vec.isUndef()) { |
5148 | if (IdxVal != 0) { |
5149 | SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); |
5150 | WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, |
5151 | ShiftBits); |
5152 | } |
5153 | return ExtractSubVec(WideSubVec); |
5154 | } |
5155 | |
5156 | if (ISD::isBuildVectorAllZeros(Vec.getNode())) { |
5157 | NumElems = WideOpVT.getVectorNumElements(); |
5158 | unsigned ShiftLeft = NumElems - SubVecNumElems; |
5159 | unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; |
5160 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, |
5161 | DAG.getConstant(ShiftLeft, dl, MVT::i8)); |
5162 | Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, |
5163 | DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; |
5164 | return ExtractSubVec(Vec); |
5165 | } |
5166 | |
5167 | if (IdxVal == 0) { |
5168 | // Zero lower bits of the Vec |
5169 | SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); |
5170 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); |
5171 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); |
5172 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); |
5173 | // Merge them together; SubVec should be zero-extended.
5174 | WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, |
5175 | getZeroVector(WideOpVT, Subtarget, DAG, dl), |
5176 | SubVec, ZeroIdx); |
5177 | Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); |
5178 | return ExtractSubVec(Vec); |
5179 | } |
5180 | |
5181 | // Simple case when we put subvector in the upper part |
5182 | if (IdxVal + SubVecNumElems == NumElems) { |
5183 | // Zero upper bits of the Vec |
5184 | WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, |
5185 | DAG.getConstant(IdxVal, dl, MVT::i8)); |
5186 | SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); |
5187 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); |
5188 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); |
5189 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); |
5190 | Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); |
5191 | return ExtractSubVec(Vec); |
5192 | } |
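// e.g. for case 2 above, inserting a v4i1 subvector at IdxVal == 4 into a
// v8i1 vector (ignoring the widening to a natively supported k-register
// type): the subvector is shifted into lanes 4-7 with kshiftl 4, lanes 4-7
// of Vec are cleared by the kshiftl/kshiftr pair by 4, and the two values
// are ORed together.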
5193 | // Subvector should be inserted in the middle - use shuffle |
5194 | WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, |
5195 | SubVec, ZeroIdx); |
5196 | SmallVector<int, 64> Mask; |
5197 | for (unsigned i = 0; i < NumElems; ++i) |
5198 | Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5199 |                i - IdxVal : i + NumElems);
5200 | return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); |
5201 | } |
5202 | |
5203 | /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5204 | /// instructions. This is used because creating CONCAT_VECTORS nodes of
5205 | /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower |
5206 | /// large BUILD_VECTORS. |
5207 | static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT, |
5208 | unsigned NumElems, SelectionDAG &DAG, |
5209 | const SDLoc &dl) { |
5210 | SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); |
5211 | return insert128BitVector(V, V2, NumElems / 2, DAG, dl); |
5212 | } |
5213 | |
5214 | static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, |
5215 | unsigned NumElems, SelectionDAG &DAG, |
5216 | const SDLoc &dl) { |
5217 | SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); |
5218 | return insert256BitVector(V, V2, NumElems / 2, DAG, dl); |
5219 | } |
5220 | |
5221 | /// Returns a vector of specified type with all bits set. |
5222 | /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. |
5223 | /// Then bitcast to their original type, ensuring they get CSE'd. |
5224 | static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { |
5225 | assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5226 |        "Expected a 128/256/512-bit vector type");
5227 | |
5228 | APInt Ones = APInt::getAllOnesValue(32); |
5229 | unsigned NumElts = VT.getSizeInBits() / 32; |
5230 | SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); |
5231 | return DAG.getBitcast(VT, Vec); |
5232 | } |
5233 | |
5234 | static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, |
5235 | SelectionDAG &DAG) { |
5236 | EVT InVT = In.getValueType(); |
5237 | assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5238 | |
5239 | if (VT.is128BitVector() && InVT.is128BitVector()) |
5240 | return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) |
5241 | : DAG.getZeroExtendVectorInReg(In, DL, VT); |
5242 | |
5243 | // For 256-bit vectors, we only need the lower (128-bit) input half. |
5244 | // For 512-bit vectors, we only need the lower input half or quarter. |
5245 | if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { |
5246 | int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); |
5247 | In = extractSubVector(In, 0, DAG, DL, |
5248 | std::max(128, (int)VT.getSizeInBits() / Scale)); |
5249 | } |
5250 | |
5251 | return DAG.getNode(Opc, DL, VT, In); |
5252 | } |
5253 | |
5254 | /// Generate unpacklo/unpackhi shuffle mask. |
5255 | static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, |
5256 | bool Unary) { |
5257 | assert(Mask.empty() && "Expected an empty shuffle mask vector");
5258 | int NumElts = VT.getVectorNumElements(); |
5259 | int NumEltsInLane = 128 / VT.getScalarSizeInBits(); |
5260 | |
5261 | for (int i = 0; i < NumElts; ++i) { |
5262 | unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; |
5263 | int Pos = (i % NumEltsInLane) / 2 + LaneStart; |
5264 | Pos += (Unary ? 0 : NumElts * (i % 2)); |
5265 | Pos += (Lo ? 0 : NumEltsInLane / 2); |
5266 | Mask.push_back(Pos); |
5267 | } |
5268 | } |
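// e.g. for v8i16 this produces <0, 8, 1, 9, 2, 10, 3, 11> for Lo/binary
// (punpcklwd) and <4, 12, 5, 13, 6, 14, 7, 15> for Hi/binary (punpckhwd);
// with Unary set, the second-operand indices fold back onto the first input.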
5269 | |
5270 | /// Returns a vector_shuffle node for an unpackl operation. |
5271 | static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, |
5272 | SDValue V1, SDValue V2) { |
5273 | SmallVector<int, 8> Mask; |
5274 | createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); |
5275 | return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
5276 | } |
5277 | |
5278 | /// Returns a vector_shuffle node for an unpackh operation. |
5279 | static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, |
5280 | SDValue V1, SDValue V2) { |
5281 | SmallVector<int, 8> Mask; |
5282 | createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); |
5283 | return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); |
5284 | } |
5285 | |
5286 | /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5287 | /// This produces a shuffle where the low element of V2 is swizzled into the |
5288 | /// zero/undef vector, landing at element Idx. |
5289 | /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). |
5290 | static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, |
5291 | bool IsZero, |
5292 | const X86Subtarget &Subtarget, |
5293 | SelectionDAG &DAG) { |
5294 | MVT VT = V2.getSimpleValueType(); |
5295 | SDValue V1 = IsZero |
5296 | ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); |
5297 | int NumElems = VT.getVectorNumElements(); |
5298 | SmallVector<int, 16> MaskVec(NumElems); |
5299 | for (int i = 0; i != NumElems; ++i) |
5300 | // If this is the insertion idx, put the low elt of V2 here. |
5301 | MaskVec[i] = (i == Idx) ? NumElems : i; |
5302 | return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); |
5303 | } |
5304 | |
5305 | static SDValue peekThroughBitcasts(SDValue V) { |
5306 | while (V.getNode() && V.getOpcode() == ISD::BITCAST) |
5307 | V = V.getOperand(0); |
5308 | return V; |
5309 | } |
5310 | |
5311 | static SDValue peekThroughOneUseBitcasts(SDValue V) { |
5312 | while (V.getNode() && V.getOpcode() == ISD::BITCAST && |
5313 | V.getOperand(0).hasOneUse()) |
5314 | V = V.getOperand(0); |
5315 | return V; |
5316 | } |
5317 | |
5318 | static const Constant *getTargetConstantFromNode(SDValue Op) { |
5319 | Op = peekThroughBitcasts(Op); |
5320 | |
5321 | auto *Load = dyn_cast<LoadSDNode>(Op); |
5322 | if (!Load) |
5323 | return nullptr; |
5324 | |
5325 | SDValue Ptr = Load->getBasePtr(); |
5326 | if (Ptr->getOpcode() == X86ISD::Wrapper || |
5327 | Ptr->getOpcode() == X86ISD::WrapperRIP) |
5328 | Ptr = Ptr->getOperand(0); |
5329 | |
5330 | auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); |
5331 | if (!CNode || CNode->isMachineConstantPoolEntry()) |
5332 | return nullptr; |
5333 | |
5334 | return dyn_cast<Constant>(CNode->getConstVal()); |
5335 | } |
5336 | |
5337 | // Extract raw constant bits from constant pools. |
5338 | static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, |
5339 | APInt &UndefElts, |
5340 | SmallVectorImpl<APInt> &EltBits, |
5341 | bool AllowWholeUndefs = true, |
5342 | bool AllowPartialUndefs = true) { |
5343 | assert(EltBits.empty() && "Expected an empty EltBits vector");
5344 | |
5345 | Op = peekThroughBitcasts(Op); |
5346 | |
5347 | EVT VT = Op.getValueType(); |
5348 | unsigned SizeInBits = VT.getSizeInBits(); |
5349 | assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5350 | unsigned NumElts = SizeInBits / EltSizeInBits; |
5351 | |
5352 | // Bitcast a source array of element bits to the target size. |
5353 | auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { |
5354 | unsigned NumSrcElts = UndefSrcElts.getBitWidth(); |
5355 | unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); |
5356 | assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5357 |        "Constant bit sizes don't match");
5358 | |
5359 | // Don't split if we don't allow undef bits. |
5360 | bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; |
5361 | if (UndefSrcElts.getBoolValue() && !AllowUndefs) |
5362 | return false; |
5363 | |
5364 | // If we're already the right size, don't bother bitcasting. |
5365 | if (NumSrcElts == NumElts) { |
5366 | UndefElts = UndefSrcElts; |
5367 | EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); |
5368 | return true; |
5369 | } |
5370 | |
5371 | // Extract all the undef/constant element data and pack into single bitsets. |
5372 | APInt UndefBits(SizeInBits, 0); |
5373 | APInt MaskBits(SizeInBits, 0); |
5374 | |
5375 | for (unsigned i = 0; i != NumSrcElts; ++i) { |
5376 | unsigned BitOffset = i * SrcEltSizeInBits; |
5377 | if (UndefSrcElts[i]) |
5378 | UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); |
5379 | MaskBits.insertBits(SrcEltBits[i], BitOffset); |
5380 | } |
5381 | |
5382 | // Split the undef/constant single bitset data into the target elements. |
5383 | UndefElts = APInt(NumElts, 0); |
5384 | EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); |
5385 | |
5386 | for (unsigned i = 0; i != NumElts; ++i) { |
5387 | unsigned BitOffset = i * EltSizeInBits; |
5388 | APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); |
5389 | |
5390 | // Only treat an element as UNDEF if all bits are UNDEF. |
5391 | if (UndefEltBits.isAllOnesValue()) { |
5392 | if (!AllowWholeUndefs) |
5393 | return false; |
5394 | UndefElts.setBit(i); |
5395 | continue; |
5396 | } |
5397 | |
5398 | // If only some bits are UNDEF then treat them as zero (or bail if not |
5399 | // supported). |
5400 | if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) |
5401 | return false; |
5402 | |
5403 | APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); |
5404 | EltBits[i] = Bits; // Bits already has EltSizeInBits width.
5405 | } |
5406 | return true; |
5407 | }; |
5408 | |
5409 | // Collect constant bits and insert into mask/undef bit masks. |
5410 | auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, |
5411 | unsigned UndefBitIndex) { |
5412 | if (!Cst) |
5413 | return false; |
5414 | if (isa<UndefValue>(Cst)) { |
5415 | Undefs.setBit(UndefBitIndex); |
5416 | return true; |
5417 | } |
5418 | if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { |
5419 | Mask = CInt->getValue(); |
5420 | return true; |
5421 | } |
5422 | if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { |
5423 | Mask = CFP->getValueAPF().bitcastToAPInt(); |
5424 | return true; |
5425 | } |
5426 | return false; |
5427 | }; |
5428 | |
5429 | // Extract constant bits from build vector. |
5430 | if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { |
5431 | unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); |
5432 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
5433 | |
5434 | APInt UndefSrcElts(NumSrcElts, 0); |
5435 | SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); |
5436 | for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { |
5437 | const SDValue &Src = Op.getOperand(i); |
5438 | if (Src.isUndef()) { |
5439 | UndefSrcElts.setBit(i); |
5440 | continue; |
5441 | } |
5442 | auto *Cst = cast<ConstantSDNode>(Src); |
5443 | SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); |
5444 | } |
5445 | return CastBitData(UndefSrcElts, SrcEltBits); |
5446 | } |
5447 | |
5448 | // Extract constant bits from constant pool vector. |
5449 | if (auto *Cst = getTargetConstantFromNode(Op)) { |
5450 | Type *CstTy = Cst->getType(); |
5451 | if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits())) |
5452 | return false; |
5453 | |
5454 | unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); |
5455 | unsigned NumSrcElts = CstTy->getVectorNumElements(); |
5456 | |
5457 | APInt UndefSrcElts(NumSrcElts, 0); |
5458 | SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); |
5459 | for (unsigned i = 0; i != NumSrcElts; ++i) |
5460 | if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], |
5461 | UndefSrcElts, i)) |
5462 | return false; |
5463 | |
5464 | return CastBitData(UndefSrcElts, SrcEltBits); |
5465 | } |
5466 | |
5467 | // Extract constant bits from a broadcasted constant pool scalar. |
5468 | if (Op.getOpcode() == X86ISD::VBROADCAST && |
5469 | EltSizeInBits <= VT.getScalarSizeInBits()) { |
5470 | if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { |
5471 | unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); |
5472 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
5473 | |
5474 | APInt UndefSrcElts(NumSrcElts, 0); |
5475 | SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); |
5476 | if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { |
5477 | if (UndefSrcElts[0]) |
5478 | UndefSrcElts.setBits(0, NumSrcElts); |
5479 | SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); |
5480 | return CastBitData(UndefSrcElts, SrcEltBits); |
5481 | } |
5482 | } |
5483 | } |
5484 | |
5485 | // Extract a rematerialized scalar constant insertion. |
5486 | if (Op.getOpcode() == X86ISD::VZEXT_MOVL && |
5487 | Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && |
5488 | isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { |
5489 | unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); |
5490 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; |
5491 | |
5492 | APInt UndefSrcElts(NumSrcElts, 0); |
5493 | SmallVector<APInt, 64> SrcEltBits; |
5494 | auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); |
5495 | SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); |
5496 | SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); |
5497 | return CastBitData(UndefSrcElts, SrcEltBits); |
5498 | } |
5499 | |
5500 | return false; |
5501 | } |
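// e.g. requesting 64-bit elements from the v4i32 build vector <1, 2, undef, 4>
// packs adjacent pairs: EltBits == { 0x0000000200000001, 0x0000000400000000 },
// with the partially-undef second element treated as zero bits; a wide element
// becomes UNDEF only if every covered bit is undef.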
5502 | |
5503 | static bool getTargetShuffleMaskIndices(SDValue MaskNode, |
5504 | unsigned MaskEltSizeInBits, |
5505 | SmallVectorImpl<uint64_t> &RawMask) { |
5506 | APInt UndefElts; |
5507 | SmallVector<APInt, 64> EltBits; |
5508 | |
5509 | // Extract the raw target constant bits. |
5510 | // FIXME: We currently don't support UNDEF bits or mask entries. |
5511 | if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, |
5512 | EltBits, /* AllowWholeUndefs */ false, |
5513 | /* AllowPartialUndefs */ false)) |
5514 | return false; |
5515 | |
5516 | // Insert the extracted elements into the mask. |
5517 | for (APInt Elt : EltBits) |
5518 | RawMask.push_back(Elt.getZExtValue()); |
5519 | |
5520 | return true; |
5521 | } |
5522 | |
5523 | /// Calculates the shuffle mask corresponding to the target-specific opcode. |
5524 | /// If the mask could be calculated, returns it in \p Mask, returns the shuffle |
5525 | /// operands in \p Ops, and returns true. |
5526 | /// Sets \p IsUnary to true if only one source is used. Note that this will set |
5527 | /// IsUnary for shuffles which use a single input multiple times, and in those |
5528 | /// cases it will adjust the mask to only have indices within that single input. |
5529 | /// It is an error to call this with non-empty Mask/Ops vectors. |
5530 | static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, |
5531 | SmallVectorImpl<SDValue> &Ops, |
5532 | SmallVectorImpl<int> &Mask, bool &IsUnary) { |
5533 | unsigned NumElems = VT.getVectorNumElements(); |
5534 | SDValue ImmN; |
5535 | |
5536 | assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5537 | assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5538 | |
5539 | IsUnary = false; |
5540 | bool IsFakeUnary = false; |
5541 | switch(N->getOpcode()) { |
5542 | case X86ISD::BLENDI: |
5543 | ImmN = N->getOperand(N->getNumOperands()-1); |
5544 | DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5545 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5546 | break; |
5547 | case X86ISD::SHUFP: |
5548 | ImmN = N->getOperand(N->getNumOperands()-1); |
5549 | DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5550 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5551 | break; |
5552 | case X86ISD::INSERTPS: |
5553 | ImmN = N->getOperand(N->getNumOperands()-1); |
5554 | DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5555 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5556 | break; |
5557 | case X86ISD::UNPCKH: |
5558 | DecodeUNPCKHMask(VT, Mask); |
5559 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5560 | break; |
5561 | case X86ISD::UNPCKL: |
5562 | DecodeUNPCKLMask(VT, Mask); |
5563 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5564 | break; |
5565 | case X86ISD::MOVHLPS: |
5566 | DecodeMOVHLPSMask(NumElems, Mask); |
5567 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5568 | break; |
5569 | case X86ISD::MOVLHPS: |
5570 | DecodeMOVLHPSMask(NumElems, Mask); |
5571 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5572 | break; |
5573 | case X86ISD::PALIGNR: |
5574 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5575 | ImmN = N->getOperand(N->getNumOperands()-1); |
5576 | DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5577 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5578 | Ops.push_back(N->getOperand(1)); |
5579 | Ops.push_back(N->getOperand(0)); |
5580 | break; |
5581 | case X86ISD::VSHLDQ: |
5582 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5583 | ImmN = N->getOperand(N->getNumOperands() - 1); |
5584 | DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5585 | IsUnary = true; |
5586 | break; |
5587 | case X86ISD::VSRLDQ: |
5588 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5589 | ImmN = N->getOperand(N->getNumOperands() - 1); |
5590 | DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5591 | IsUnary = true; |
5592 | break; |
5593 | case X86ISD::PSHUFD: |
5594 | case X86ISD::VPERMILPI: |
5595 | ImmN = N->getOperand(N->getNumOperands()-1); |
5596 | DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5597 | IsUnary = true; |
5598 | break; |
5599 | case X86ISD::PSHUFHW: |
5600 | ImmN = N->getOperand(N->getNumOperands()-1); |
5601 | DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5602 | IsUnary = true; |
5603 | break; |
5604 | case X86ISD::PSHUFLW: |
5605 | ImmN = N->getOperand(N->getNumOperands()-1); |
5606 | DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5607 | IsUnary = true; |
5608 | break; |
5609 | case X86ISD::VZEXT_MOVL: |
5610 | DecodeZeroMoveLowMask(VT, Mask); |
5611 | IsUnary = true; |
5612 | break; |
5613 | case X86ISD::VBROADCAST: { |
5614 | SDValue N0 = N->getOperand(0); |
5615 | // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, |
5616 | // add the pre-extracted value to the Ops vector. |
5617 | if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
5618 | N0.getOperand(0).getValueType() == VT && |
5619 | N0.getConstantOperandVal(1) == 0) |
5620 | Ops.push_back(N0.getOperand(0)); |
5621 | |
5622 | // We only decode broadcasts of same-sized vectors, unless the broadcast |
5623 | // came from an extract from the original width. If we found one, we |
5624 | // pushed it into the Ops vector above.
5625 | if (N0.getValueType() == VT || !Ops.empty()) { |
5626 | DecodeVectorBroadcast(VT, Mask); |
5627 | IsUnary = true; |
5628 | break; |
5629 | } |
5630 | return false; |
5631 | } |
5632 | case X86ISD::VPERMILPV: { |
5633 | IsUnary = true; |
5634 | SDValue MaskNode = N->getOperand(1); |
5635 | unsigned MaskEltSize = VT.getScalarSizeInBits(); |
5636 | SmallVector<uint64_t, 32> RawMask; |
5637 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { |
5638 | DecodeVPERMILPMask(VT, RawMask, Mask); |
5639 | break; |
5640 | } |
5641 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5642 | DecodeVPERMILPMask(C, MaskEltSize, Mask); |
5643 | break; |
5644 | } |
5645 | return false; |
5646 | } |
5647 | case X86ISD::PSHUFB: { |
5648 | IsUnary = true; |
5649 | SDValue MaskNode = N->getOperand(1); |
5650 | SmallVector<uint64_t, 32> RawMask; |
5651 | if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { |
5652 | DecodePSHUFBMask(RawMask, Mask); |
5653 | break; |
5654 | } |
5655 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5656 | DecodePSHUFBMask(C, Mask); |
5657 | break; |
5658 | } |
5659 | return false; |
5660 | } |
5661 | case X86ISD::VPERMI: |
5662 | ImmN = N->getOperand(N->getNumOperands()-1); |
5663 | DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5664 | IsUnary = true; |
5665 | break; |
5666 | case X86ISD::MOVSS: |
5667 | case X86ISD::MOVSD: |
5668 | DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); |
5669 | break; |
5670 | case X86ISD::VPERM2X128: |
5671 | ImmN = N->getOperand(N->getNumOperands()-1); |
5672 | DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); |
5673 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5674 | break; |
5675 | case X86ISD::MOVSLDUP: |
5676 | DecodeMOVSLDUPMask(VT, Mask); |
5677 | IsUnary = true; |
5678 | break; |
5679 | case X86ISD::MOVSHDUP: |
5680 | DecodeMOVSHDUPMask(VT, Mask); |
5681 | IsUnary = true; |
5682 | break; |
5683 | case X86ISD::MOVDDUP: |
5684 | DecodeMOVDDUPMask(VT, Mask); |
5685 | IsUnary = true; |
5686 | break; |
5687 | case X86ISD::MOVLHPD: |
5688 | case X86ISD::MOVLPD: |
5689 | case X86ISD::MOVLPS: |
5690 | // Not yet implemented |
5691 | return false; |
5692 | case X86ISD::VPERMIL2: { |
5693 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5694 | unsigned MaskEltSize = VT.getScalarSizeInBits(); |
5695 | SDValue MaskNode = N->getOperand(2); |
5696 | SDValue CtrlNode = N->getOperand(3); |
5697 | if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) { |
5698 | unsigned CtrlImm = CtrlOp->getZExtValue(); |
5699 | SmallVector<uint64_t, 32> RawMask; |
5700 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { |
5701 | DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask); |
5702 | break; |
5703 | } |
5704 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5705 | DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask); |
5706 | break; |
5707 | } |
5708 | } |
5709 | return false; |
5710 | } |
5711 | case X86ISD::VPPERM: { |
5712 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); |
5713 | SDValue MaskNode = N->getOperand(2); |
5714 | SmallVector<uint64_t, 32> RawMask; |
5715 | if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { |
5716 | DecodeVPPERMMask(RawMask, Mask); |
5717 | break; |
5718 | } |
5719 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5720 | DecodeVPPERMMask(C, Mask); |
5721 | break; |
5722 | } |
5723 | return false; |
5724 | } |
5725 | case X86ISD::VPERMV: { |
5726 | IsUnary = true; |
5727 | // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. |
5728 | Ops.push_back(N->getOperand(1)); |
5729 | SDValue MaskNode = N->getOperand(0); |
5730 | SmallVector<uint64_t, 32> RawMask; |
5731 | unsigned MaskEltSize = VT.getScalarSizeInBits(); |
5732 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { |
5733 | DecodeVPERMVMask(RawMask, Mask); |
5734 | break; |
5735 | } |
5736 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5737 | DecodeVPERMVMask(C, MaskEltSize, Mask); |
5738 | break; |
5739 | } |
5740 | return false; |
5741 | } |
5742 | case X86ISD::VPERMV3: { |
5743 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); |
5744 | // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. |
5745 | Ops.push_back(N->getOperand(0)); |
5746 | Ops.push_back(N->getOperand(2)); |
5747 | SDValue MaskNode = N->getOperand(1); |
5748 | unsigned MaskEltSize = VT.getScalarSizeInBits(); |
5749 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5750 | DecodeVPERMV3Mask(C, MaskEltSize, Mask); |
5751 | break; |
5752 | } |
5753 | return false; |
5754 | } |
5755 | case X86ISD::VPERMIV3: { |
5756 | IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2); |
5757 | // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one. |
5758 | Ops.push_back(N->getOperand(1)); |
5759 | Ops.push_back(N->getOperand(2)); |
5760 | SDValue MaskNode = N->getOperand(0); |
5761 | unsigned MaskEltSize = VT.getScalarSizeInBits(); |
5762 | if (auto *C = getTargetConstantFromNode(MaskNode)) { |
5763 | DecodeVPERMV3Mask(C, MaskEltSize, Mask); |
5764 | break; |
5765 | } |
5766 | return false; |
5767 | } |
5768 | default: llvm_unreachable("unknown target shuffle node");
5769 | } |
5770 | |
5771 | // Empty mask indicates the decode failed. |
5772 | if (Mask.empty()) |
5773 | return false; |
5774 | |
5775 | // Check if we're getting a shuffle mask with zeroed elements.
5776 | if (!AllowSentinelZero) |
5777 | if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) |
5778 | return false; |
5779 | |
5780 | // If we have a fake unary shuffle, the shuffle mask is spread across two |
5781 | // inputs that are actually the same node. Re-map the mask to always point |
5782 | // into the first input. |
5783 | if (IsFakeUnary) |
5784 | for (int &M : Mask) |
5785 | if (M >= (int)Mask.size()) |
5786 | M -= Mask.size(); |
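// Worked example (annotation): for a v4f32 SHUFP whose two operands are the
// same node, a decoded mask of {0, 1, 4, 5} is remapped to {0, 1, 0, 1} so
// every index points into the single input.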
5787 | |
5788 | // If we didn't already add operands in the opcode-specific code, default to |
5789 | // adding 1 or 2 operands starting at 0. |
5790 | if (Ops.empty()) { |
5791 | Ops.push_back(N->getOperand(0)); |
5792 | if (!IsUnary || IsFakeUnary) |
5793 | Ops.push_back(N->getOperand(1)); |
5794 | } |
5795 | |
5796 | return true; |
5797 | } |
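// Minimal usage sketch (annotation, not from the original source):
//   SmallVector<SDValue, 2> Ops;
//   SmallVector<int, 16> Mask;
//   bool IsUnary;
//   if (getTargetShuffleMask(N, N->getSimpleValueType(0),
//                            /*AllowSentinelZero*/ true, Ops, Mask, IsUnary)) {
//     // Mask holds element indices plus SM_SentinelZero/SM_SentinelUndef.
//   }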
5798 | |
5799 | /// Check a target shuffle mask's inputs to see if we can set any values to |
5800 | /// SM_SentinelZero - this is for elements that are known to be zero |
5801 | /// (not just zeroable) from their inputs. |
5802 | /// Returns true if the target shuffle mask was decoded. |
5803 | static bool setTargetShuffleZeroElements(SDValue N, |
5804 | SmallVectorImpl<int> &Mask, |
5805 | SmallVectorImpl<SDValue> &Ops) { |
5806 | bool IsUnary; |
5807 | if (!isTargetShuffle(N.getOpcode())) |
5808 | return false; |
5809 | |
5810 | MVT VT = N.getSimpleValueType(); |
5811 | if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) |
5812 | return false; |
5813 | |
5814 | SDValue V1 = Ops[0]; |
5815 | SDValue V2 = IsUnary ? V1 : Ops[1]; |
5816 | |
5817 | V1 = peekThroughBitcasts(V1); |
5818 | V2 = peekThroughBitcasts(V2); |
5819 | |
5820 | assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5821 |        "Illegal split of shuffle value type");
5822 | unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); |
5823 | |
5824 | // Extract known constant input data. |
5825 | APInt UndefSrcElts[2]; |
5826 | SmallVector<APInt, 32> SrcEltBits[2]; |
5827 | bool IsSrcConstant[2] = { |
5828 | getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], |
5829 | SrcEltBits[0], true, false), |
5830 | getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], |
5831 | SrcEltBits[1], true, false)}; |
5832 | |
5833 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
5834 | int M = Mask[i]; |
5835 | |
5836 | // Already decoded as SM_SentinelZero / SM_SentinelUndef. |
5837 | if (M < 0) |
5838 | continue; |
5839 | |
5840 | // Determine shuffle input and normalize the mask. |
5841 | unsigned SrcIdx = M / Size; |
5842 | SDValue V = M < Size ? V1 : V2; |
5843 | M %= Size; |
5844 | |
5845 | // We are referencing an UNDEF input. |
5846 | if (V.isUndef()) { |
5847 | Mask[i] = SM_SentinelUndef; |
5848 | continue; |
5849 | } |
5850 | |
5851 | // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. |
5852 | // TODO: We currently only set UNDEF for integer types - floats use the same |
5853 | // registers as vectors and many of the scalar folded loads rely on the |
5854 | // SCALAR_TO_VECTOR pattern. |
5855 | if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && |
5856 | (Size % V.getValueType().getVectorNumElements()) == 0) { |
5857 | int Scale = Size / V.getValueType().getVectorNumElements(); |
5858 | int Idx = M / Scale; |
5859 | if (Idx != 0 && !VT.isFloatingPoint()) |
5860 | Mask[i] = SM_SentinelUndef; |
5861 | else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) |
5862 | Mask[i] = SM_SentinelZero; |
5863 | continue; |
5864 | } |
5865 | |
5866 | // Attempt to extract from the source's constant bits. |
5867 | if (IsSrcConstant[SrcIdx]) { |
5868 | if (UndefSrcElts[SrcIdx][M]) |
5869 | Mask[i] = SM_SentinelUndef; |
5870 | else if (SrcEltBits[SrcIdx][M] == 0) |
5871 | Mask[i] = SM_SentinelZero; |
5872 | } |
5873 | } |
5874 | |
5875 | assert(VT.getVectorNumElements() == Mask.size() &&
5876 |        "Different mask size from vector size!");
5877 | return true; |
5878 | } |
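// Illustrative example (annotation): if one shuffle operand is a constant
// vector, lanes that read a known-zero element are rewritten to
// SM_SentinelZero and lanes that read an undef element to SM_SentinelUndef.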
5879 | |
5880 | // Attempt to decode ops that could be represented as a shuffle mask. |
5881 | // The decoded shuffle mask may contain a different number of elements than
5882 | // the destination value type.
5883 | static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, |
5884 | SmallVectorImpl<SDValue> &Ops, |
5885 | SelectionDAG &DAG) { |
5886 | Mask.clear(); |
5887 | Ops.clear(); |
5888 | |
5889 | MVT VT = N.getSimpleValueType(); |
5890 | unsigned NumElts = VT.getVectorNumElements(); |
5891 | unsigned NumSizeInBits = VT.getSizeInBits(); |
5892 | unsigned NumBitsPerElt = VT.getScalarSizeInBits(); |
5893 | assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5894 |        "Expected byte aligned value types");
5895 | |
5896 | unsigned Opcode = N.getOpcode(); |
5897 | switch (Opcode) { |
5898 | case ISD::AND: |
5899 | case X86ISD::ANDNP: { |
5900 | // Attempt to decode as a per-byte mask. |
5901 | APInt UndefElts; |
5902 | SmallVector<APInt, 32> EltBits; |
5903 | SDValue N0 = N.getOperand(0); |
5904 | SDValue N1 = N.getOperand(1); |
5905 | bool IsAndN = (X86ISD::ANDNP == Opcode); |
5906 | uint64_t ZeroMask = IsAndN ? 255 : 0; |
5907 | if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) |
5908 | return false; |
5909 | for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { |
5910 | if (UndefElts[i]) { |
5911 | Mask.push_back(SM_SentinelUndef); |
5912 | continue; |
5913 | } |
5914 | uint64_t ByteBits = EltBits[i].getZExtValue(); |
5915 | if (ByteBits != 0 && ByteBits != 255) |
5916 | return false; |
5917 | Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); |
5918 | } |
5919 | Ops.push_back(IsAndN ? N1 : N0); |
5920 | return true; |
5921 | } |
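// Worked example (annotation): (v2i64 and X, <0x00000000FFFFFFFF,
// 0xFFFFFFFFFFFFFFFF>) decodes per-byte to the mask
// {0,1,2,3,Z,Z,Z,Z,8,9,10,11,12,13,14,15} with Z == SM_SentinelZero.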
5922 | case ISD::SCALAR_TO_VECTOR: { |
5923 | // Match against a scalar_to_vector of an extract from a vector, |
5924 | // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. |
5925 | SDValue N0 = N.getOperand(0); |
5926 | SDValue SrcExtract; |
5927 | |
5928 | if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
5929 | N0.getOperand(0).getValueType() == VT) { |
5930 | SrcExtract = N0; |
5931 | } else if (N0.getOpcode() == ISD::AssertZext && |
5932 | N0.getOperand(0).getOpcode() == X86ISD::PEXTRW && |
5933 | cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) { |
5934 | SrcExtract = N0.getOperand(0); |
5935 | assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5936 | } else if (N0.getOpcode() == ISD::AssertZext && |
5937 | N0.getOperand(0).getOpcode() == X86ISD::PEXTRB && |
5938 | cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) { |
5939 | SrcExtract = N0.getOperand(0); |
5940 | assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5941 | } |
5942 | |
5943 | if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1))) |
5944 | return false; |
5945 | |
5946 | SDValue SrcVec = SrcExtract.getOperand(0); |
5947 | EVT SrcVT = SrcVec.getValueType(); |
5948 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); |
5949 | unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; |
5950 | |
5951 | unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); |
5952 | if (NumSrcElts <= SrcIdx) |
5953 | return false; |
5954 | |
5955 | Ops.push_back(SrcVec); |
5956 | Mask.push_back(SrcIdx); |
5957 | Mask.append(NumZeros, SM_SentinelZero); |
5958 | Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); |
5959 | return true; |
5960 | } |
5961 | case X86ISD::PINSRB: |
5962 | case X86ISD::PINSRW: { |
5963 | SDValue InVec = N.getOperand(0); |
5964 | SDValue InScl = N.getOperand(1); |
5965 | uint64_t InIdx = N.getConstantOperandVal(2); |
5966 | assert(InIdx < NumElts && "Illegal insertion index");
5967 | |
5968 | // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. |
5969 | if (X86::isZeroNode(InScl)) { |
5970 | Ops.push_back(InVec); |
5971 | for (unsigned i = 0; i != NumElts; ++i) |
5972 | Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); |
5973 | return true; |
5974 | } |
5975 | |
5976 | // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern. |
5977 | // TODO: Expand this to support INSERT_VECTOR_ELT/etc. |
5978 | unsigned ExOp = |
5979 | (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); |
5980 | if (InScl.getOpcode() != ISD::AssertZext || |
5981 | InScl.getOperand(0).getOpcode() != ExOp) |
5982 | return false; |
5983 | |
5984 | SDValue ExVec = InScl.getOperand(0).getOperand(0); |
5985 | uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1); |
5986 | assert(ExIdx < NumElts && "Illegal extraction index");
5987 | Ops.push_back(InVec); |
5988 | Ops.push_back(ExVec); |
5989 | for (unsigned i = 0; i != NumElts; ++i) |
5990 | Mask.push_back(i == InIdx ? NumElts + ExIdx : i); |
5991 | return true; |
5992 | } |
5993 | case X86ISD::PACKSS: { |
5994 | // If we know input saturation won't happen we can treat this |
5995 | // as a truncation shuffle. |
5996 | if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt || |
5997 | DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt) |
5998 | return false; |
5999 | |
6000 | Ops.push_back(N.getOperand(0)); |
6001 | Ops.push_back(N.getOperand(1)); |
6002 | for (unsigned i = 0; i != NumElts; ++i) |
6003 | Mask.push_back(i * 2); |
6004 | return true; |
6005 | } |
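// Worked example (annotation): a v16i8 PACKSS of two v8i16 inputs whose
// elements already fit in 8 signed bits acts as the truncating shuffle
// {0,2,4,...,30} over the 32 concatenated input bytes.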
6006 | case X86ISD::VSHLI: |
6007 | case X86ISD::VSRLI: { |
6008 | uint64_t ShiftVal = N.getConstantOperandVal(1); |
6009 | // Out of range bit shifts are guaranteed to be zero. |
6010 | if (NumBitsPerElt <= ShiftVal) { |
6011 | Mask.append(NumElts, SM_SentinelZero); |
6012 | return true; |
6013 | } |
6014 | |
6015 | // We can only decode 'whole byte' bit shifts as shuffles. |
6016 | if ((ShiftVal % 8) != 0) |
6017 | break; |
6018 | |
6019 | uint64_t ByteShift = ShiftVal / 8; |
6020 | unsigned NumBytes = NumSizeInBits / 8; |
6021 | unsigned NumBytesPerElt = NumBitsPerElt / 8; |
6022 | Ops.push_back(N.getOperand(0)); |
6023 | |
6024 | // Clear mask to all zeros and insert the shifted byte indices. |
6025 | Mask.append(NumBytes, SM_SentinelZero); |
6026 | |
6027 | if (X86ISD::VSHLI == Opcode) { |
6028 | for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) |
6029 | for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) |
6030 | Mask[i + j] = i + j - ByteShift; |
6031 | } else { |
6032 | for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) |
6033 | for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) |
6034 | Mask[i + j - ByteShift] = i + j; |
6035 | } |
6036 | return true; |
6037 | } |
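// Worked example (annotation): (v2i64 VSHLI X, 16) shifts by 2 bytes within
// each 8-byte element, i.e. the byte shuffle
// {Z,Z,0,1,2,3,4,5,Z,Z,8,9,10,11,12,13} with Z == SM_SentinelZero.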
6038 | case ISD::ZERO_EXTEND_VECTOR_INREG: |
6039 | case X86ISD::VZEXT: { |
6040 | // TODO - add support for VPMOVZX with smaller input vector types. |
6041 | SDValue Src = N.getOperand(0); |
6042 | MVT SrcVT = Src.getSimpleValueType(); |
6043 | if (NumSizeInBits != SrcVT.getSizeInBits()) |
6044 | break; |
6045 | DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask); |
6046 | Ops.push_back(Src); |
6047 | return true; |
6048 | } |
6049 | } |
6050 | |
6051 | return false; |
6052 | } |
6053 | |
6054 | /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. |
6055 | static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, |
6056 | SmallVectorImpl<int> &Mask) { |
6057 | int MaskWidth = Mask.size(); |
6058 | SmallVector<SDValue, 16> UsedInputs; |
6059 | for (int i = 0, e = Inputs.size(); i < e; ++i) { |
6060 | int lo = UsedInputs.size() * MaskWidth; |
6061 | int hi = lo + MaskWidth; |
6062 | if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { |
6063 | UsedInputs.push_back(Inputs[i]); |
6064 | continue; |
6065 | } |
6066 | for (int &M : Mask) |
6067 | if (lo <= M) |
6068 | M -= MaskWidth; |
6069 | } |
6070 | Inputs = UsedInputs; |
6071 | } |
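// Illustrative example (annotation): with Inputs == {A, B}, MaskWidth == 4
// and Mask == {4, 5, 6, 7} (only B is referenced), A is dropped and the
// mask is rebased to {0, 1, 2, 3} over Inputs == {B}.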
6072 | |
6073 | /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs |
6074 | /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the |
6075 | /// remaining input indices in case we now have a unary shuffle and adjust the |
6076 | /// inputs accordingly. |
6077 | /// Returns true if the target shuffle mask was decoded. |
6078 | static bool resolveTargetShuffleInputs(SDValue Op, |
6079 | SmallVectorImpl<SDValue> &Inputs, |
6080 | SmallVectorImpl<int> &Mask, |
6081 | SelectionDAG &DAG) { |
6082 | if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) |
6083 | if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) |
6084 | return false; |
6085 | |
6086 | resolveTargetShuffleInputsAndMask(Inputs, Mask); |
6087 | return true; |
6088 | } |
6089 | |
6090 | /// Returns the scalar element that will make up the ith |
6091 | /// element of the result of the vector shuffle. |
6092 | static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, |
6093 | unsigned Depth) { |
6094 | if (Depth == 6) |
6095 | return SDValue(); // Limit search depth. |
6096 | |
6097 | SDValue V = SDValue(N, 0); |
6098 | EVT VT = V.getValueType(); |
6099 | unsigned Opcode = V.getOpcode(); |
6100 | |
6101 | // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. |
6102 | if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { |
6103 | int Elt = SV->getMaskElt(Index); |
6104 | |
6105 | if (Elt < 0) |
6106 | return DAG.getUNDEF(VT.getVectorElementType()); |
6107 | |
6108 | unsigned NumElems = VT.getVectorNumElements(); |
6109 | SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) |
6110 | : SV->getOperand(1); |
6111 | return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); |
6112 | } |
6113 | |
6114 | // Recurse into target specific vector shuffles to find scalars. |
6115 | if (isTargetShuffle(Opcode)) { |
6116 | MVT ShufVT = V.getSimpleValueType(); |
6117 | MVT ShufSVT = ShufVT.getVectorElementType(); |
6118 | int NumElems = (int)ShufVT.getVectorNumElements(); |
6119 | SmallVector<int, 16> ShuffleMask; |
6120 | SmallVector<SDValue, 16> ShuffleOps; |
6121 | bool IsUnary; |
6122 | |
6123 | if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) |
6124 | return SDValue(); |
6125 | |
6126 | int Elt = ShuffleMask[Index]; |
6127 | if (Elt == SM_SentinelZero) |
6128 | return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) |
6129 | : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); |
6130 | if (Elt == SM_SentinelUndef) |
6131 | return DAG.getUNDEF(ShufSVT); |
6132 | |
6133 | assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6134 | SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; |
6135 | return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, |
6136 | Depth+1); |
6137 | } |
6138 | |
6139 | // Actual nodes that may contain scalar elements |
6140 | if (Opcode == ISD::BITCAST) { |
6141 | V = V.getOperand(0); |
6142 | EVT SrcVT = V.getValueType(); |
6143 | unsigned NumElems = VT.getVectorNumElements(); |
6144 | |
6145 | if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) |
6146 | return SDValue(); |
6147 | } |
6148 | |
6149 | if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) |
6150 | return (Index == 0) ? V.getOperand(0) |
6151 | : DAG.getUNDEF(VT.getVectorElementType()); |
6152 | |
6153 | if (V.getOpcode() == ISD::BUILD_VECTOR) |
6154 | return V.getOperand(Index); |
6155 | |
6156 | return SDValue(); |
6157 | } |
6158 | |
6159 | /// Custom lower build_vector of v16i8. |
6160 | static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, |
6161 | unsigned NumNonZero, unsigned NumZero, |
6162 | SelectionDAG &DAG, |
6163 | const X86Subtarget &Subtarget) { |
6164 | if (NumNonZero > 8 && !Subtarget.hasSSE41()) |
6165 | return SDValue(); |
6166 | |
6167 | SDLoc dl(Op); |
6168 | SDValue V; |
6169 | bool First = true; |
6170 | |
6171 | // SSE4.1 - use PINSRB to insert each byte directly. |
6172 | if (Subtarget.hasSSE41()) { |
6173 | for (unsigned i = 0; i < 16; ++i) { |
6174 | bool IsNonZero = (NonZeros & (1 << i)) != 0; |
6175 | if (IsNonZero) { |
6176 | // If the build vector contains zeros or our first insertion is not the
6177 | // first index, then insert into a zero vector to break any register
6178 | // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6179 | if (First) { |
6180 | First = false; |
6181 | if (NumZero || 0 != i) |
6182 | V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); |
6183 | else { |
6184 | assert(0 == i && "Expected insertion into zero-index");
6185 | V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); |
6186 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); |
6187 | V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); |
6188 | V = DAG.getBitcast(MVT::v16i8, V); |
6189 | continue; |
6190 | } |
6191 | } |
6192 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, |
6193 | Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); |
6194 | } |
6195 | } |
6196 | |
6197 | return V; |
6198 | } |
6199 | |
6200 | // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. |
6201 | for (unsigned i = 0; i < 16; ++i) { |
6202 | bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; |
6203 | if (ThisIsNonZero && First) { |
6204 | if (NumZero) |
6205 | V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); |
6206 | else |
6207 | V = DAG.getUNDEF(MVT::v8i16); |
6208 | First = false; |
6209 | } |
6210 | |
6211 | if ((i & 1) != 0) { |
6212 | // FIXME: Investigate extending to i32 instead of just i16. |
6213 | // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6214 | SDValue ThisElt, LastElt; |
6215 | bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; |
6216 | if (LastIsNonZero) { |
6217 | LastElt = |
6218 | DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); |
6219 | } |
6220 | if (ThisIsNonZero) { |
6221 | ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); |
6222 | ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, |
6223 | DAG.getConstant(8, dl, MVT::i8)); |
6224 | if (LastIsNonZero) |
6225 | ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); |
6226 | } else |
6227 | ThisElt = LastElt; |
6228 | |
6229 | if (ThisElt) { |
6230 | if (1 == i) { |
6231 | V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) |
6232 | : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); |
6233 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); |
6234 | V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); |
6235 | V = DAG.getBitcast(MVT::v8i16, V); |
6236 | } else { |
6237 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, |
6238 | DAG.getIntPtrConstant(i / 2, dl)); |
6239 | } |
6240 | } |
6241 | } |
6242 | } |
6243 | |
6244 | return DAG.getBitcast(MVT::v16i8, V); |
6245 | } |
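// Worked example (annotation, pre-SSE4.1 path): adjacent non-zero bytes
// b0 and b1 are merged as (zext(b1) << 8) | zext(b0) and inserted as one
// i16 lane at index i/2 (a PINSRW), halving the number of insertions.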
6246 | |
6247 | /// Custom lower build_vector of v8i16. |
6248 | static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, |
6249 | unsigned NumNonZero, unsigned NumZero, |
6250 | SelectionDAG &DAG, |
6251 | const X86Subtarget &Subtarget) { |
6252 | if (NumNonZero > 4 && !Subtarget.hasSSE41()) |
6253 | return SDValue(); |
6254 | |
6255 | SDLoc dl(Op); |
6256 | SDValue V; |
6257 | bool First = true; |
6258 | for (unsigned i = 0; i < 8; ++i) { |
6259 | bool IsNonZero = (NonZeros & (1 << i)) != 0; |
6260 | if (IsNonZero) { |
6261 | // If the build vector contains zeros or our first insertion is not the
6262 | // first index, then insert into a zero vector to break any register
6263 | // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6264 | if (First) { |
6265 | First = false; |
6266 | if (NumZero || 0 != i) |
6267 | V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); |
6268 | else { |
6269 | assert(0 == i && "Expected insertion into zero-index");
6270 | V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); |
6271 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); |
6272 | V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); |
6273 | V = DAG.getBitcast(MVT::v8i16, V); |
6274 | continue; |
6275 | } |
6276 | } |
6277 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, |
6278 | Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); |
6279 | } |
6280 | } |
6281 | |
6282 | return V; |
6283 | } |
6284 | |
6285 | /// Custom lower build_vector of v4i32 or v4f32. |
6286 | static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, |
6287 | const X86Subtarget &Subtarget) { |
6288 | // Find all zeroable elements. |
6289 | std::bitset<4> Zeroable; |
6290 | for (int i=0; i < 4; ++i) { |
6291 | SDValue Elt = Op->getOperand(i); |
6292 | Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); |
6293 | } |
6294 | assert(Zeroable.size() - Zeroable.count() > 1 &&
6295 |        "We expect at least two non-zero elements!");
6296 | |
6297 | // We only know how to deal with build_vector nodes where elements are either |
6298 | // zeroable or extract_vector_elt with constant index. |
6299 | SDValue FirstNonZero; |
6300 | unsigned FirstNonZeroIdx; |
6301 | for (unsigned i=0; i < 4; ++i) { |
6302 | if (Zeroable[i]) |
6303 | continue; |
6304 | SDValue Elt = Op->getOperand(i); |
6305 | if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
6306 | !isa<ConstantSDNode>(Elt.getOperand(1))) |
6307 | return SDValue(); |
6308 | // Make sure that this node is extracting from a 128-bit vector. |
6309 | MVT VT = Elt.getOperand(0).getSimpleValueType(); |
6310 | if (!VT.is128BitVector()) |
6311 | return SDValue(); |
6312 | if (!FirstNonZero.getNode()) { |
6313 | FirstNonZero = Elt; |
6314 | FirstNonZeroIdx = i; |
6315 | } |
6316 | } |
6317 | |
6318 | assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6319 | SDValue V1 = FirstNonZero.getOperand(0); |
6320 | MVT VT = V1.getSimpleValueType(); |
6321 | |
6322 | // See if this build_vector can be lowered as a blend with zero. |
6323 | SDValue Elt; |
6324 | unsigned EltMaskIdx, EltIdx; |
6325 | int Mask[4]; |
6326 | for (EltIdx = 0; EltIdx < 4; ++EltIdx) { |
6327 | if (Zeroable[EltIdx]) { |
6328 | // The zero vector will be on the right hand side. |
6329 | Mask[EltIdx] = EltIdx+4; |
6330 | continue; |
6331 | } |
6332 | |
6333 | Elt = Op->getOperand(EltIdx); |
6334 | // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. |
6335 | EltMaskIdx = Elt.getConstantOperandVal(1); |
6336 | if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) |
6337 | break; |
6338 | Mask[EltIdx] = EltIdx; |
6339 | } |
6340 | |
6341 | if (EltIdx == 4) { |
6342 | // Let the shuffle legalizer deal with blend operations. |
6343 | SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); |
6344 | if (V1.getSimpleValueType() != VT) |
6345 | V1 = DAG.getBitcast(VT, V1); |
6346 | return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); |
6347 | } |
6348 | |
6349 | // See if we can lower this build_vector to a INSERTPS. |
6350 | if (!Subtarget.hasSSE41()) |
6351 | return SDValue(); |
6352 | |
6353 | SDValue V2 = Elt.getOperand(0); |
6354 | if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) |
6355 | V1 = SDValue(); |
6356 | |
6357 | bool CanFold = true; |
6358 | for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { |
6359 | if (Zeroable[i]) |
6360 | continue; |
6361 | |
6362 | SDValue Current = Op->getOperand(i); |
6363 | SDValue SrcVector = Current->getOperand(0); |
6364 | if (!V1.getNode()) |
6365 | V1 = SrcVector; |
6366 | CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); |
6367 | } |
6368 | |
6369 | if (!CanFold) |
6370 | return SDValue(); |
6371 | |
6372 | assert(V1.getNode() && "Expected at least two non-zero elements!");
6373 | if (V1.getSimpleValueType() != MVT::v4f32) |
6374 | V1 = DAG.getBitcast(MVT::v4f32, V1); |
6375 | if (V2.getSimpleValueType() != MVT::v4f32) |
6376 | V2 = DAG.getBitcast(MVT::v4f32, V2); |
6377 | |
6378 | // Ok, we can emit an INSERTPS instruction. |
6379 | unsigned ZMask = Zeroable.to_ulong(); |
6380 | |
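// INSERTPS immediate layout: bits [7:6] select the source element
// (EltMaskIdx), bits [5:4] select the destination lane (EltIdx), and
// bits [3:0] hold the zero mask.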
6381 | unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; |
6382 | assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6383 | SDLoc DL(Op); |
6384 | SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, |
6385 | DAG.getIntPtrConstant(InsertPSMask, DL)); |
6386 | return DAG.getBitcast(VT, Result); |
6387 | } |
6388 | |
6389 | /// Return a vector logical shift node. |
6390 | static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, |
6391 | SelectionDAG &DAG, const TargetLowering &TLI, |
6392 | const SDLoc &dl) { |
6393 | assert(VT.is128BitVector() && "Unknown type for VShift");
6394 | MVT ShVT = MVT::v16i8; |
6395 | unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; |
6396 | SrcOp = DAG.getBitcast(ShVT, SrcOp); |
6397 | MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); |
6398 | assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6399 | SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); |
6400 | return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); |
6401 | } |
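// Illustrative example (annotation): getVShift(/*isLeft*/ true, MVT::v2i64,
// V, 64, ...) bitcasts V to v16i8, emits an 8-byte VSHLDQ (PSLLDQ) and
// bitcasts the result back to v2i64.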
6402 | |
6403 | static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, |
6404 | SelectionDAG &DAG) { |
6405 | |
6406 | // Check if the scalar load can be widened into a vector load, and if
6407 | // the address is "base + cst", see if the cst can be "absorbed" into
6408 | // the shuffle mask.
6409 | if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { |
6410 | SDValue Ptr = LD->getBasePtr(); |
6411 | if (!ISD::isNormalLoad(LD) || LD->isVolatile()) |
6412 | return SDValue(); |
6413 | EVT PVT = LD->getValueType(0); |
6414 | if (PVT != MVT::i32 && PVT != MVT::f32) |
6415 | return SDValue(); |
6416 | |
6417 | int FI = -1; |
6418 | int64_t Offset = 0; |
6419 | if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { |
6420 | FI = FINode->getIndex(); |
6421 | Offset = 0; |
6422 | } else if (DAG.isBaseWithConstantOffset(Ptr) && |
6423 | isa<FrameIndexSDNode>(Ptr.getOperand(0))) { |
6424 | FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); |
6425 | Offset = Ptr.getConstantOperandVal(1); |
6426 | Ptr = Ptr.getOperand(0); |
6427 | } else { |
6428 | return SDValue(); |
6429 | } |
6430 | |
6431 | // FIXME: 256-bit vector instructions don't require a strict alignment;
6432 | // improve this code to support them better.
6433 | unsigned RequiredAlign = VT.getSizeInBits()/8; |
6434 | SDValue Chain = LD->getChain(); |
6435 | // Make sure the stack object alignment is at least 16 or 32. |
6436 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
6437 | if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { |
6438 | if (MFI.isFixedObjectIndex(FI)) { |
6439 | // Can't change the alignment. FIXME: It's possible to compute
6440 | // the exact stack offset and reference FI + adjusted offset instead;
6441 | // if someone *really* cares about this, that's the way to implement it.
6442 | return SDValue(); |
6443 | } else { |
6444 | MFI.setObjectAlignment(FI, RequiredAlign); |
6445 | } |
6446 | } |
6447 | |
6448 | // (Offset % 16 or 32) must be a multiple of 4. The address is then
6449 | // Ptr + (Offset & ~(RequiredAlign - 1)).
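// Worked example (annotation): with RequiredAlign == 16 and Offset == 20,
// StartOffset == 16 and EltNo == (20 - 16) >> 2 == 1, so lane 1 of the
// widened load supplies the splat.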
6450 | if (Offset < 0) |
6451 | return SDValue(); |
6452 | if ((Offset % RequiredAlign) & 3) |
6453 | return SDValue(); |
6454 | int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); |
6455 | if (StartOffset) { |
6456 | SDLoc DL(Ptr); |
6457 | Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, |
6458 | DAG.getConstant(StartOffset, DL, Ptr.getValueType())); |
6459 | } |
6460 | |
6461 | int EltNo = (Offset - StartOffset) >> 2; |
6462 | unsigned NumElems = VT.getVectorNumElements(); |
6463 | |
6464 | EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); |
6465 | SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, |
6466 | LD->getPointerInfo().getWithOffset(StartOffset)); |
6467 | |
6468 | SmallVector<int, 8> Mask(NumElems, EltNo); |
6469 | |
6470 | return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); |
6471 | } |
6472 | |
6473 | return SDValue(); |
6474 | } |
6475 | |
6476 | /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the |
6477 | /// elements can be replaced by a single large load which has the same value as |
6478 | /// a build_vector or insert_subvector whose loaded operands are 'Elts'. |
6479 | /// |
6480 | /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a |
6481 | static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, |
6482 | const SDLoc &DL, SelectionDAG &DAG, |
6483 | const X86Subtarget &Subtarget, |
6484 | bool isAfterLegalize) { |
6485 | unsigned NumElems = Elts.size(); |
6486 | |
6487 | int LastLoadedElt = -1; |
6488 | SmallBitVector LoadMask(NumElems, false); |
6489 | SmallBitVector ZeroMask(NumElems, false); |
6490 | SmallBitVector UndefMask(NumElems, false); |
6491 | |
6492 | // For each element in the initializer, see if we've found a load, zero or an |
6493 | // undef. |
6494 | for (unsigned i = 0; i < NumElems; ++i) { |
6495 | SDValue Elt = peekThroughBitcasts(Elts[i]); |
6496 | if (!Elt.getNode()) |
6497 | return SDValue(); |
6498 | |
6499 | if (Elt.isUndef()) |
6500 | UndefMask[i] = true; |
6501 | else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) |
6502 | ZeroMask[i] = true; |
6503 | else if (ISD::isNON_EXTLoad(Elt.getNode())) { |
6504 | LoadMask[i] = true; |
6505 | LastLoadedElt = i; |
6506 | // Each loaded element must be the correct fractional portion of the |
6507 | // requested vector load. |
6508 | if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) |
6509 | return SDValue(); |
6510 | } else |
6511 | return SDValue(); |
6512 | } |
6513 | assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6514 |        "Incomplete element masks");
6515 | |
6516 | // Handle Special Cases - all undef or undef/zero. |
6517 | if (UndefMask.count() == NumElems) |
6518 | return DAG.getUNDEF(VT); |
6519 | |
6520 | // FIXME: Should we return this as a BUILD_VECTOR instead? |
6521 | if ((ZeroMask | UndefMask).count() == NumElems) |
6522 | return VT.isInteger() ? DAG.getConstant(0, DL, VT) |
6523 | : DAG.getConstantFP(0.0, DL, VT); |
6524 | |
6525 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
6526 | int FirstLoadedElt = LoadMask.find_first(); |
6527 | SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); |
6528 | LoadSDNode *LDBase = cast<LoadSDNode>(EltBase); |
6529 | EVT LDBaseVT = EltBase.getValueType(); |
6530 | |
6531 | // Consecutive loads can contain UNDEFs but not ZERO elements.
6532 | // Consecutive loads with UNDEF and ZERO elements require an
6533 | // additional shuffle stage to clear the ZERO elements.
6534 | bool IsConsecutiveLoad = true; |
6535 | bool IsConsecutiveLoadWithZeros = true; |
6536 | for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { |
6537 | if (LoadMask[i]) { |
6538 | SDValue Elt = peekThroughBitcasts(Elts[i]); |
6539 | LoadSDNode *LD = cast<LoadSDNode>(Elt); |
6540 | if (!DAG.areNonVolatileConsecutiveLoads( |
6541 | LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, |
6542 | i - FirstLoadedElt)) { |
6543 | IsConsecutiveLoad = false; |
6544 | IsConsecutiveLoadWithZeros = false; |
6545 | break; |
6546 | } |
6547 | } else if (ZeroMask[i]) { |
6548 | IsConsecutiveLoad = false; |
6549 | } |
6550 | } |
6551 | |
6552 | auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) { |
6553 | auto MMOFlags = LDBase->getMemOperand()->getFlags(); |
6554 | assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6555 |        "Cannot merge volatile loads.");
6556 | SDValue NewLd = |
6557 | DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), |
6558 | LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); |
6559 | DAG.makeEquivalentMemoryOrdering(LDBase, NewLd); |
6560 | return NewLd; |
6561 | }; |
6562 | |
6563 | // LOAD - all consecutive load/undefs (must start/end with a load). |
6564 | // If we have found an entire vector of loads and undefs, then return a large |
6565 | // load of the entire vector width starting at the base pointer. |
6566 | // If the vector contains zeros, then attempt to shuffle those elements. |
6567 | if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && |
6568 | (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { |
6569 | assert(LDBase && "Did not find base load for merging consecutive loads");
6570 | EVT EltVT = LDBase->getValueType(0); |
6571 | // Ensure that the input vector size for the merged loads matches the |
6572 | // cumulative size of the input elements. |
6573 | if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) |
6574 | return SDValue(); |
6575 | |
6576 | if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) |
6577 | return SDValue(); |
6578 | |
6579 | // Don't create 256-bit non-temporal aligned loads without AVX2 as these |
6580 | // will lower to regular temporal loads and use the cache. |
6581 | if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && |
6582 | VT.is256BitVector() && !Subtarget.hasInt256()) |
6583 | return SDValue(); |
6584 | |
6585 | if (IsConsecutiveLoad) |
6586 | return CreateLoad(VT, LDBase); |
6587 | |
6588 | // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded |
6589 | // vector and a zero vector to clear out the zero elements. |
6590 | if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { |
6591 | SmallVector<int, 4> ClearMask(NumElems, -1); |
6592 | for (unsigned i = 0; i < NumElems; ++i) { |
6593 | if (ZeroMask[i]) |
6594 | ClearMask[i] = i + NumElems; |
6595 | else if (LoadMask[i]) |
6596 | ClearMask[i] = i; |
6597 | } |
6598 | SDValue V = CreateLoad(VT, LDBase); |
6599 | SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) |
6600 | : DAG.getConstantFP(0.0, DL, VT); |
6601 | return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); |
6602 | } |
6603 | } |
6604 | |
6605 | int LoadSize = |
6606 | (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); |
6607 | |
6608 | // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. |
6609 | if (IsConsecutiveLoad && FirstLoadedElt == 0 && |
6610 | (LoadSize == 32 || LoadSize == 64) && |
6611 | ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { |
6612 | MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize) |
6613 | : MVT::getIntegerVT(LoadSize); |
6614 | MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); |
6615 | if (TLI.isTypeLegal(VecVT)) { |
6616 | SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); |
6617 | SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; |
6618 | SDValue ResNode = |
6619 | DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, |
6620 | LDBase->getPointerInfo(), |
6621 | LDBase->getAlignment(), |
6622 | false/*isVolatile*/, true/*ReadMem*/, |
6623 | false/*WriteMem*/); |
6624 | DAG.makeEquivalentMemoryOrdering(LDBase, ResNode); |
6625 | return DAG.getBitcast(VT, ResNode); |
6626 | } |
6627 | } |
6628 | |
6629 | return SDValue(); |
6630 | } |
6631 | |
6632 | static Constant *getConstantVector(MVT VT, const APInt &SplatValue, |
6633 | unsigned SplatBitSize, LLVMContext &C) { |
6634 | unsigned ScalarSize = VT.getScalarSizeInBits(); |
6635 | unsigned NumElm = SplatBitSize / ScalarSize; |
6636 | |
6637 | SmallVector<Constant *, 32> ConstantVec; |
6638 | for (unsigned i = 0; i < NumElm; i++) { |
6639 | APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); |
6640 | Constant *Const; |
6641 | if (VT.isFloatingPoint()) { |
6642 | if (ScalarSize == 32) { |
6643 | Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); |
6644 | } else { |
6645 | assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6646 | Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); |
6647 | } |
6648 | } else |
6649 | Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); |
6650 | ConstantVec.push_back(Const); |
6651 | } |
6652 | return ConstantVector::get(ArrayRef<Constant *>(ConstantVec)); |
6653 | } |
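// Worked example (annotation): for VT == v4i32 and a 64-bit SplatValue of
// 0x0000000100000002, ScalarSize == 32 and NumElm == 2, producing the
// ConstantVector <i32 2, i32 1> (low bits are extracted first).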
6654 | |
6655 | static bool isUseOfShuffle(SDNode *N) { |
6656 | for (auto *U : N->uses()) { |
6657 | if (isTargetShuffle(U->getOpcode())) |
6658 | return true; |
6659 | if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U)) // Look through
6660 |   return true;                                 // bitcasts; keep scanning.
6661 | } |
6662 | return false; |
6663 | } |
6664 | |
6665 | /// Attempt to use the vbroadcast instruction to generate a splat value |
6666 | /// from a splat BUILD_VECTOR which uses: |
6667 | /// a. A single scalar load, or a constant. |
6668 | /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). |
6669 | /// |
6670 | /// The VBROADCAST node is returned when a pattern is found, |
6671 | /// or SDValue() otherwise. |
6672 | static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, |
6673 | const X86Subtarget &Subtarget, |
6674 | SelectionDAG &DAG) { |
6675 | // VBROADCAST requires AVX. |
6676 | // TODO: Splats could be generated for non-AVX CPUs using SSE |
6677 | // instructions, but there's less potential gain for only 128-bit vectors. |
6678 | if (!Subtarget.hasAVX()) |
6679 | return SDValue(); |
6680 | |
6681 | MVT VT = BVOp->getSimpleValueType(0); |
6682 | SDLoc dl(BVOp); |
6683 | |
6684 | assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6685 |        "Unsupported vector type for broadcast.");
6686 | |
6687 | BitVector UndefElements; |
6688 | SDValue Ld = BVOp->getSplatValue(&UndefElements); |
6689 | |
6690 | // We need a splat of a single value to use broadcast, and it doesn't |
6691 | // make any sense if the value is only in one element of the vector. |
6692 | if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) { |
6693 | APInt SplatValue, Undef; |
6694 | unsigned SplatBitSize; |
6695 | bool HasUndef; |
6696 | // Check if this is a repeated constant pattern suitable for broadcasting. |
6697 | if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && |
6698 | SplatBitSize > VT.getScalarSizeInBits() && |
6699 | SplatBitSize < VT.getSizeInBits()) { |
6700 | // Avoid replacing the build_vector with a broadcast when it is used by a
6701 | // shuffle, to preserve the existing custom lowering of shuffles.
6702 | if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) |
6703 | return SDValue(); |
6704 | // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
6705 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
6706 | LLVMContext *Ctx = DAG.getContext(); |
6707 | MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); |
6708 | if (Subtarget.hasAVX()) { |
6709 | if (SplatBitSize <= 64 && Subtarget.hasAVX2() && |
6710 | !(SplatBitSize == 64 && Subtarget.is32Bit())) { |
6711 | // Splatted value can fit in one INTEGER constant in constant pool. |
6712 | // Load the constant and broadcast it. |
6713 | MVT CVT = MVT::getIntegerVT(SplatBitSize); |
6714 | Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); |
6715 | Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); |
6716 | SDValue CP = DAG.getConstantPool(C, PVT); |
6717 | unsigned Repeat = VT.getSizeInBits() / SplatBitSize; |
6718 | |
6719 | unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); |
6720 | Ld = DAG.getLoad( |
6721 | CVT, dl, DAG.getEntryNode(), CP, |
6722 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
6723 | Alignment); |
6724 | SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, |
6725 | MVT::getVectorVT(CVT, Repeat), Ld); |
6726 | return DAG.getBitcast(VT, Brdcst); |
6727 | } else if (SplatBitSize == 32 || SplatBitSize == 64) { |
6728 | // Splatted value can fit in one FLOAT constant in constant pool. |
6729 | // Load the constant and broadcast it. |
6730 | // AVX supports 32-bit and 64-bit broadcasts for floats only.
6731 | // There is no 64-bit integer broadcast on a 32-bit subtarget.
6732 | MVT CVT = MVT::getFloatingPointVT(SplatBitSize); |
6733 | // Lower the splat via APFloat directly, to avoid any conversion. |
6734 | Constant *C = |
6735 | SplatBitSize == 32 |
6736 | ? ConstantFP::get(*Ctx, |
6737 | APFloat(APFloat::IEEEsingle(), SplatValue)) |
6738 | : ConstantFP::get(*Ctx, |
6739 | APFloat(APFloat::IEEEdouble(), SplatValue)); |
6740 | SDValue CP = DAG.getConstantPool(C, PVT); |
6741 | unsigned Repeat = VT.getSizeInBits() / SplatBitSize; |
6742 | |
6743 | unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); |
6744 | Ld = DAG.getLoad( |
6745 | CVT, dl, DAG.getEntryNode(), CP, |
6746 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
6747 | Alignment); |
6748 | SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, |
6749 | MVT::getVectorVT(CVT, Repeat), Ld); |
6750 | return DAG.getBitcast(VT, Brdcst); |
6751 | } else if (SplatBitSize > 64) { |
6752 | // Load the vector of constants and broadcast it. |
6753 | MVT CVT = VT.getScalarType(); |
6754 | Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, |
6755 | *Ctx); |
6756 | SDValue VCP = DAG.getConstantPool(VecC, PVT); |
6757 | unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); |
6758 | unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment(); |
6759 | Ld = DAG.getLoad( |
6760 | MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, |
6761 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
6762 | Alignment); |
6763 | SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); |
6764 | return DAG.getBitcast(VT, Brdcst); |
6765 | } |
6766 | } |
6767 | } |
6768 | return SDValue(); |
6769 | } |
6770 | |
6771 | bool ConstSplatVal = |
6772 | (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); |
6773 | |
6774 | // Make sure that all of the users of a non-constant load are from the |
6775 | // BUILD_VECTOR node. |
6776 | if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) |
6777 | return SDValue(); |
6778 | |
6779 | unsigned ScalarSize = Ld.getValueSizeInBits(); |
6780 | bool IsGE256 = (VT.getSizeInBits() >= 256); |
6781 | |
6782 | // When optimizing for size, generate up to 5 extra bytes for a broadcast |
6783 | // instruction to save 8 or more bytes of constant pool data. |
6784 | // TODO: If multiple splats are generated to load the same constant, |
6785 | // it may be detrimental to overall size. There needs to be a way to detect |
6786 | // that condition to know if this is truly a size win. |
6787 | bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); |
6788 | |
6789 | // Handle broadcasting a single constant scalar from the constant pool |
6790 | // into a vector. |
6791 | // On Sandybridge (no AVX2), it is still better to load a constant vector |
6792 | // from the constant pool and not to broadcast it from a scalar. |
6793 | // But override that restriction when optimizing for size. |
6794 | // TODO: Check if splatting is recommended for other AVX-capable CPUs. |
6795 | if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { |
6796 | EVT CVT = Ld.getValueType(); |
6797 | assert(!CVT.isVector() && "Must not broadcast a vector type");
6798 | |
6799 | // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. |
6800 | // For size optimization, also splat v2f64 and v2i64, and for size opt |
6801 | // with AVX2, also splat i8 and i16. |
6802 | // With pattern matching, the VBROADCAST node may become a VMOVDDUP. |
6803 | if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || |
6804 | (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { |
6805 | const Constant *C = nullptr; |
6806 | if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) |
6807 | C = CI->getConstantIntValue(); |
6808 | else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) |
6809 | C = CF->getConstantFPValue(); |
6810 | |
6811 | assert(C && "Invalid constant type");
6812 | |
6813 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
6814 | SDValue CP = |
6815 | DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); |
6816 | unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); |
6817 | Ld = DAG.getLoad( |
6818 | CVT, dl, DAG.getEntryNode(), CP, |
6819 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
6820 | Alignment); |
6821 | |
6822 | return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); |
6823 | } |
6824 | } |
6825 | |
6826 | bool IsLoad = ISD::isNormalLoad(Ld.getNode()); |
6827 | |
6828 | // Handle AVX2 in-register broadcasts. |
6829 | if (!IsLoad && Subtarget.hasInt256() && |
6830 | (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) |
6831 | return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); |
6832 | |
6833 | // The scalar source must be a normal load. |
6834 | if (!IsLoad) |
6835 | return SDValue(); |
6836 | |
6837 | if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || |
6838 | (Subtarget.hasVLX() && ScalarSize == 64)) |
6839 | return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); |
6840 | |
6841 | // The integer check is needed for the 64-bit into 128-bit case, so that this
6842 | // does not match double: there is no vbroadcastsd with an xmm destination.
6843 | if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { |
6844 | if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) |
6845 | return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); |
6846 | } |
6847 | |
6848 | // Unsupported broadcast. |
6849 | return SDValue(); |
6850 | } |
6851 | |
6852 | /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real |
6853 | /// underlying vector and index. |
6854 | /// |
6855 | /// Modifies \p ExtractedFromVec to the real vector and returns the real |
6856 | /// index. |
6857 | static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, |
6858 | SDValue ExtIdx) { |
6859 | int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); |
6860 | if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) |
6861 | return Idx; |
6862 | |
6863 | // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already |
6864 | // lowered this: |
6865 | // (extract_vector_elt (v8f32 %vreg1), Constant<6>) |
6866 | // to: |
6867 | // (extract_vector_elt (vector_shuffle<2,u,u,u> |
6868 | // (extract_subvector (v8f32 %vreg0), Constant<4>), |
6869 | // undef) |
6870 | // Constant<0>) |
6871 | // In this case the vector is the extract_subvector expression and the index |
6872 | // is 2, as specified by the shuffle. |
6873 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); |
6874 | SDValue ShuffleVec = SVOp->getOperand(0); |
6875 | MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); |
6876 | assert(ShuffleVecVT.getVectorElementType() ==
6877 |        ExtractedFromVec.getSimpleValueType().getVectorElementType());
6878 | |
6879 | int ShuffleIdx = SVOp->getMaskElt(Idx); |
6880 | if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { |
6881 | ExtractedFromVec = ShuffleVec; |
6882 | return ShuffleIdx; |
6883 | } |
6884 | return Idx; |
6885 | } |
6886 | |
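/// Lower a build_vector whose elements mostly come from EXTRACT_VECTOR_ELTs
/// of at most two source vectors as a shuffle of those sources, followed by
/// INSERT_VECTOR_ELT for the few remaining elements.
/// A sketch with hypothetical operands:
///   (v4f32 build_vector (extract V0, 0), (extract V0, 1), X, (extract V1, 3))
/// becomes
///   (insert_vector_elt (vector_shuffle<0,1,u,7> V0, V1), X, 2)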
6887 | static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { |
6888 | MVT VT = Op.getSimpleValueType(); |
6889 | |
6890 | // Skip if insert_vec_elt is not supported. |
6891 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
6892 | if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) |
6893 | return SDValue(); |
6894 | |
6895 | SDLoc DL(Op); |
6896 | unsigned NumElems = Op.getNumOperands(); |
6897 | |
6898 | SDValue VecIn1; |
6899 | SDValue VecIn2; |
6900 | SmallVector<unsigned, 4> InsertIndices; |
6901 | SmallVector<int, 8> Mask(NumElems, -1); |
6902 | |
6903 | for (unsigned i = 0; i != NumElems; ++i) { |
6904 | unsigned Opc = Op.getOperand(i).getOpcode(); |
6905 | |
6906 | if (Opc == ISD::UNDEF) |
6907 | continue; |
6908 | |
6909 | if (Opc != ISD::EXTRACT_VECTOR_ELT) { |
6910 | // Quit if more than one element needs inserting.
6911 | if (InsertIndices.size() > 1) |
6912 | return SDValue(); |
6913 | |
6914 | InsertIndices.push_back(i); |
6915 | continue; |
6916 | } |
6917 | |
6918 | SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); |
6919 | SDValue ExtIdx = Op.getOperand(i).getOperand(1); |
6920 | |
6921 | // Quit if non-constant index. |
6922 | if (!isa<ConstantSDNode>(ExtIdx)) |
6923 | return SDValue(); |
6924 | int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); |
6925 | |
6926 | // Quit if extracted from vector of different type. |
6927 | if (ExtractedFromVec.getValueType() != VT) |
6928 | return SDValue(); |
6929 | |
6930 | if (!VecIn1.getNode()) |
6931 | VecIn1 = ExtractedFromVec; |
6932 | else if (VecIn1 != ExtractedFromVec) { |
6933 | if (!VecIn2.getNode()) |
6934 | VecIn2 = ExtractedFromVec; |
6935 | else if (VecIn2 != ExtractedFromVec) |
6936 | // Quit if there are more than two vectors to shuffle.
6937 | return SDValue(); |
6938 | } |
6939 | |
6940 | if (ExtractedFromVec == VecIn1) |
6941 | Mask[i] = Idx; |
6942 | else if (ExtractedFromVec == VecIn2) |
6943 | Mask[i] = Idx + NumElems; |
6944 | } |
6945 | |
6946 | if (!VecIn1.getNode()) |
6947 | return SDValue(); |
6948 | |
6949 | VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); |
6950 | SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); |
6951 | |
6952 | for (unsigned Idx : InsertIndices) |
6953 | NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), |
6954 | DAG.getIntPtrConstant(Idx, DL)); |
6955 | |
6956 | return NV; |
6957 | } |
6958 | |
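/// Convert a constant build_vector of i1 values into an integer whose bit i
/// equals element i of the vector; undef elements contribute a zero bit.
/// A worked example with hypothetical values:
///   (v8i1 build_vector 1, 0, 1, 1, 0, 0, 0, 0)
/// becomes the i8 constant 0b00001101, i.e. 13.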
6959 | static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { |
6960 | assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6961 |        Op.getScalarValueSizeInBits() == 1 &&
6962 |        "Can not convert non-constant vector");
6963 | uint64_t Immediate = 0; |
6964 | for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { |
6965 | SDValue In = Op.getOperand(idx); |
6966 | if (!In.isUndef()) |
6967 | Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; |
6968 | } |
6969 | SDLoc dl(Op); |
6970 | MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); |
6971 | return DAG.getConstant(Immediate, dl, VT); |
6972 | } |
6973 | // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. |
6974 | SDValue |
6975 | X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { |
6976 | |
6977 | MVT VT = Op.getSimpleValueType(); |
6978 | assert((VT.getVectorElementType() == MVT::i1) &&
6979 |        "Unexpected type in LowerBUILD_VECTORvXi1!");
6980 | |
6981 | SDLoc dl(Op); |
6982 | if (ISD::isBuildVectorAllZeros(Op.getNode())) |
6983 | return DAG.getTargetConstant(0, dl, VT); |
6984 | |
6985 | if (ISD::isBuildVectorAllOnes(Op.getNode())) |
6986 | return DAG.getTargetConstant(1, dl, VT); |
6987 | |
6988 | if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { |
6989 | SDValue Imm = ConvertI1VectorToInteger(Op, DAG); |
6990 | if (Imm.getValueSizeInBits() == VT.getSizeInBits()) |
6991 | return DAG.getBitcast(VT, Imm); |
6992 | SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); |
6993 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, |
6994 | DAG.getIntPtrConstant(0, dl)); |
6995 | } |
6996 | |
6997 | // The vector has one or more non-constant elements.
6998 | uint64_t Immediate = 0; |
6999 | SmallVector<unsigned, 16> NonConstIdx; |
7000 | bool IsSplat = true; |
7001 | bool HasConstElts = false; |
7002 | int SplatIdx = -1; |
7003 | for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { |
7004 | SDValue In = Op.getOperand(idx); |
7005 | if (In.isUndef()) |
7006 | continue; |
7007 | if (!isa<ConstantSDNode>(In)) |
7008 | NonConstIdx.push_back(idx); |
7009 | else { |
7010 | Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; |
7011 | HasConstElts = true; |
7012 | } |
7013 | if (SplatIdx < 0) |
7014 | SplatIdx = idx; |
7015 | else if (In != Op.getOperand(SplatIdx)) |
7016 | IsSplat = false; |
7017 | } |
7018 | |
7019 | // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7020 | if (IsSplat) |
7021 | return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), |
7022 | DAG.getConstant(1, dl, VT), |
7023 | DAG.getConstant(0, dl, VT)); |
7024 | |
7025 | // Insert the non-constant elements one by one.
7026 | SDValue DstVec; |
7027 | SDValue Imm; |
7028 | if (Immediate) { |
7029 | MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); |
7030 | Imm = DAG.getConstant(Immediate, dl, ImmVT); |
7031 | } |
7032 | else if (HasConstElts) |
7033 | Imm = DAG.getConstant(0, dl, VT); |
7034 | else |
7035 | Imm = DAG.getUNDEF(VT); |
7036 | if (Imm.getValueSizeInBits() == VT.getSizeInBits()) |
7037 | DstVec = DAG.getBitcast(VT, Imm); |
7038 | else { |
7039 | SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); |
7040 | DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, |
7041 | DAG.getIntPtrConstant(0, dl)); |
7042 | } |
7043 | |
7044 | for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { |
7045 | unsigned InsertIdx = NonConstIdx[i]; |
7046 | DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, |
7047 | Op.getOperand(InsertIdx), |
7048 | DAG.getIntPtrConstant(InsertIdx, dl)); |
7049 | } |
7050 | return DstVec; |
7051 | } |
7052 | |
7053 | /// \brief Return true if \p N implements a horizontal binop and return the |
7054 | /// operands for the horizontal binop into V0 and V1. |
7055 | /// |
7056 | /// This is a helper function of LowerToHorizontalOp(). |
7057 | /// This function checks whether the input build_vector \p N implements a
7058 | /// horizontal operation. Parameter \p Opcode defines the kind of horizontal |
7059 | /// operation to match. |
7060 | /// For example, if \p Opcode is equal to ISD::ADD, then this function |
7061 | /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode |
7062 | /// is equal to ISD::SUB, then this function checks if this is a horizontal |
7063 | /// arithmetic sub. |
7064 | /// |
7065 | /// This function only analyzes elements of \p N whose indices are |
7066 | /// in range [BaseIdx, LastIdx). |
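/// Example with a hypothetical DAG, for \p Opcode == ISD::ADD:
///   (v4i32 build_vector (add (extract A, 0), (extract A, 1)),
///                       (add (extract A, 2), (extract A, 3)),
///                       (add (extract B, 0), (extract B, 1)),
///                       (add (extract B, 2), (extract B, 3)))
/// matches with V0 = A and V1 = B, which lowers to a single PHADDD.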
7067 | static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, |
7068 | SelectionDAG &DAG, |
7069 | unsigned BaseIdx, unsigned LastIdx, |
7070 | SDValue &V0, SDValue &V1) { |
7071 | EVT VT = N->getValueType(0); |
7072 | |
7073 | assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7074 | assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7075 |        "Invalid Vector in input!");
7076 | |
7077 | bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); |
7078 | bool CanFold = true; |
7079 | unsigned ExpectedVExtractIdx = BaseIdx; |
7080 | unsigned NumElts = LastIdx - BaseIdx; |
7081 | V0 = DAG.getUNDEF(VT); |
7082 | V1 = DAG.getUNDEF(VT); |
7083 | |
7084 | // Check if N implements a horizontal binop. |
7085 | for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { |
7086 | SDValue Op = N->getOperand(i + BaseIdx); |
7087 | |
7088 | // Skip UNDEFs. |
7089 | if (Op->isUndef()) { |
7090 | // Update the expected vector extract index. |
7091 | if (i * 2 == NumElts) |
7092 | ExpectedVExtractIdx = BaseIdx; |
7093 | ExpectedVExtractIdx += 2; |
7094 | continue; |
7095 | } |
7096 | |
7097 | CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); |
7098 | |
7099 | if (!CanFold) |
7100 | break; |
7101 | |
7102 | SDValue Op0 = Op.getOperand(0); |
7103 | SDValue Op1 = Op.getOperand(1); |
7104 | |
7105 | // Try to match the following pattern: |
7106 | // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) |
7107 | CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
7108 | Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
7109 | Op0.getOperand(0) == Op1.getOperand(0) && |
7110 | isa<ConstantSDNode>(Op0.getOperand(1)) && |
7111 | isa<ConstantSDNode>(Op1.getOperand(1))); |
7112 | if (!CanFold) |
7113 | break; |
7114 | |
7115 | unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); |
7116 | unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); |
7117 | |
7118 | if (i * 2 < NumElts) { |
7119 | if (V0.isUndef()) { |
7120 | V0 = Op0.getOperand(0); |
7121 | if (V0.getValueType() != VT) |
7122 | return false; |
7123 | } |
7124 | } else { |
7125 | if (V1.isUndef()) { |
7126 | V1 = Op0.getOperand(0); |
7127 | if (V1.getValueType() != VT) |
7128 | return false; |
7129 | } |
7130 | if (i * 2 == NumElts) |
7131 | ExpectedVExtractIdx = BaseIdx; |
7132 | } |
7133 | |
7134 | SDValue Expected = (i * 2 < NumElts) ? V0 : V1; |
7135 | if (I0 == ExpectedVExtractIdx) |
7136 | CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; |
7137 | else if (IsCommutable && I1 == ExpectedVExtractIdx) { |
7138 | // Try to match the following dag sequence: |
7139 | // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) |
7140 | CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; |
7141 | } else |
7142 | CanFold = false; |
7143 | |
7144 | ExpectedVExtractIdx += 2; |
7145 | } |
7146 | |
7147 | return CanFold; |
7148 | } |
7149 | |
7150 | /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by |
7151 | /// a concat_vector. |
7152 | /// |
7153 | /// This is a helper function of LowerToHorizontalOp(). |
7154 | /// This function expects two 256-bit vectors called V0 and V1. |
7155 | /// At first, each vector is split into two separate 128-bit vectors. |
7156 | /// Then, the resulting 128-bit vectors are used to implement two |
7157 | /// horizontal binary operations. |
7158 | /// |
7159 | /// The kind of horizontal binary operation is defined by \p X86Opcode. |
7160 | /// |
7161 | /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input
7162 | /// to the two new horizontal binops.
7163 | /// When Mode is set, the first horizontal binop dag node takes as input the
7164 | /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
7165 | /// binop dag node takes as input the lower 128 bits of V1 and the upper
7166 | /// 128 bits of V1.
7167 | /// Example: |
7168 | /// HADD V0_LO, V0_HI |
7169 | /// HADD V1_LO, V1_HI |
7170 | /// |
7171 | /// Otherwise, the first horizontal binop dag node takes as input the lower
7172 | /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
7173 | /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
7174 | /// Example: |
7175 | /// HADD V0_LO, V1_LO |
7176 | /// HADD V0_HI, V1_HI |
7177 | /// |
7178 | /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower |
7179 | /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to |
7180 | /// the upper 128-bits of the result. |
7181 | static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, |
7182 | const SDLoc &DL, SelectionDAG &DAG, |
7183 | unsigned X86Opcode, bool Mode, |
7184 | bool isUndefLO, bool isUndefHI) { |
7185 | MVT VT = V0.getSimpleValueType(); |
7186 | assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7187 |        "Invalid nodes in input!");
7188 | |
7189 | unsigned NumElts = VT.getVectorNumElements(); |
7190 | SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); |
7191 | SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); |
7192 | SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); |
7193 | SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); |
7194 | MVT NewVT = V0_LO.getSimpleValueType(); |
7195 | |
7196 | SDValue LO = DAG.getUNDEF(NewVT); |
7197 | SDValue HI = DAG.getUNDEF(NewVT); |
7198 | |
7199 | if (Mode) { |
7200 | // Don't emit a horizontal binop if the result is expected to be UNDEF. |
7201 | if (!isUndefLO && !V0->isUndef()) |
7202 | LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); |
7203 | if (!isUndefHI && !V1->isUndef()) |
7204 | HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); |
7205 | } else { |
7206 | // Don't emit a horizontal binop if the result is expected to be UNDEF. |
7207 | if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) |
7208 | LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); |
7209 | |
7210 | if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) |
7211 | HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); |
7212 | } |
7213 | |
7214 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); |
7215 | } |
7216 | |
7217 | /// Returns true iff \p BV builds a vector whose result is equivalent to the
7218 | /// result of an ADDSUB operation.
7219 | /// If true is returned, the operands of the ADDSUB (Opnd0 +- Opnd1) operation
7220 | /// are written to the parameters \p Opnd0 and \p Opnd1.
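/// A sketch with hypothetical operands, for v4f32:
///   (build_vector (fsub (extract A, 0), (extract B, 0)),
///                 (fadd (extract A, 1), (extract B, 1)),
///                 (fsub (extract A, 2), (extract B, 2)),
///                 (fadd (extract A, 3), (extract B, 3)))
/// matches with Opnd0 = A and Opnd1 = B, which lowers to a single ADDSUBPS.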
7221 | static bool isAddSub(const BuildVectorSDNode *BV, |
7222 | const X86Subtarget &Subtarget, SelectionDAG &DAG, |
7223 | SDValue &Opnd0, SDValue &Opnd1) { |
7224 | |
7225 | MVT VT = BV->getSimpleValueType(0); |
7226 | if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && |
7227 | (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && |
7228 | (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) |
7229 | return false; |
7230 | |
7231 | unsigned NumElts = VT.getVectorNumElements(); |
7232 | SDValue InVec0 = DAG.getUNDEF(VT); |
7233 | SDValue InVec1 = DAG.getUNDEF(VT); |
7234 | |
7235 | // Odd-numbered elements in the input build vector are obtained from |
7236 | // adding two integer/float elements. |
7237 | // Even-numbered elements in the input build vector are obtained from |
7238 | // subtracting two integer/float elements. |
7239 | unsigned ExpectedOpcode = ISD::FSUB; |
7240 | unsigned NextExpectedOpcode = ISD::FADD; |
7241 | bool AddFound = false; |
7242 | bool SubFound = false; |
7243 | |
7244 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
7245 | SDValue Op = BV->getOperand(i); |
7246 | |
7247 | // Skip 'undef' values. |
7248 | unsigned Opcode = Op.getOpcode(); |
7249 | if (Opcode == ISD::UNDEF) { |
7250 | std::swap(ExpectedOpcode, NextExpectedOpcode); |
7251 | continue; |
7252 | } |
7253 | |
7254 | // Early exit if we found an unexpected opcode. |
7255 | if (Opcode != ExpectedOpcode) |
7256 | return false; |
7257 | |
7258 | SDValue Op0 = Op.getOperand(0); |
7259 | SDValue Op1 = Op.getOperand(1); |
7260 | |
7261 | // Try to match the following pattern: |
7262 | // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) |
7263 | // Early exit if we cannot match that sequence. |
7264 | if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
7265 | Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
7266 | !isa<ConstantSDNode>(Op0.getOperand(1)) || |
7267 | !isa<ConstantSDNode>(Op1.getOperand(1)) || |
7268 | Op0.getOperand(1) != Op1.getOperand(1)) |
7269 | return false; |
7270 | |
7271 | unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); |
7272 | if (I0 != i) |
7273 | return false; |
7274 | |
7275 | // We found a valid add/sub node. Update the information accordingly. |
7276 | if (i & 1) |
7277 | AddFound = true; |
7278 | else |
7279 | SubFound = true; |
7280 | |
7281 | // Update InVec0 and InVec1. |
7282 | if (InVec0.isUndef()) { |
7283 | InVec0 = Op0.getOperand(0); |
7284 | if (InVec0.getSimpleValueType() != VT) |
7285 | return false; |
7286 | } |
7287 | if (InVec1.isUndef()) { |
7288 | InVec1 = Op1.getOperand(0); |
7289 | if (InVec1.getSimpleValueType() != VT) |
7290 | return false; |
7291 | } |
7292 | |
7293 | // Make sure that the operands of each add/sub node always come
7294 | // from the same pair of vectors.
7295 | if (InVec0 != Op0.getOperand(0)) { |
7296 | if (ExpectedOpcode == ISD::FSUB) |
7297 | return false; |
7298 | |
7299 | // FADD is commutable. Try to commute the operands |
7300 | // and then test again. |
7301 | std::swap(Op0, Op1); |
7302 | if (InVec0 != Op0.getOperand(0)) |
7303 | return false; |
7304 | } |
7305 | |
7306 | if (InVec1 != Op1.getOperand(0)) |
7307 | return false; |
7308 | |
7309 | // Update the pair of expected opcodes. |
7310 | std::swap(ExpectedOpcode, NextExpectedOpcode); |
7311 | } |
7312 | |
7313 | // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. |
7314 | if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef()) |
7315 | return false; |
7316 | |
7317 | Opnd0 = InVec0; |
7318 | Opnd1 = InVec1; |
7319 | return true; |
7320 | } |
7321 | |
7322 | /// Returns true if it is possible to fold a MUL and an idiom that has already
7323 | /// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7324 | /// If (and only if) true is returned, the operands of FMADDSUB are written to |
7325 | /// parameters \p Opnd0, \p Opnd1, \p Opnd2. |
7326 | /// |
7327 | /// Prior to calling this function it should be known that there is some |
7328 | /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation |
7329 | /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called |
7330 | /// before replacement of such SDNode with ADDSUB operation. Thus the number |
7331 | /// of \p Opnd0 uses is expected to be equal to 2. |
7332 | /// For example, this function may be called for the following IR: |
7333 | /// %AB = fmul fast <2 x double> %A, %B |
7334 | /// %Sub = fsub fast <2 x double> %AB, %C |
7335 | /// %Add = fadd fast <2 x double> %AB, %C |
7336 | /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, |
7337 | /// <2 x i32> <i32 0, i32 3> |
7338 | /// There is a def for %Addsub here, which potentially can be replaced by |
7339 | /// X86ISD::ADDSUB operation: |
7340 | /// %Addsub = X86ISD::ADDSUB %AB, %C |
7341 | /// and such ADDSUB can further be replaced with FMADDSUB: |
7342 | /// %Addsub = FMADDSUB %A, %B, %C. |
7343 | /// |
7344 | /// The main reason why this method is called before the replacement of the |
7345 | /// recognized ADDSUB idiom with ADDSUB operation is that such replacement |
7346 | /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit |
7347 | /// FMADDSUB is. |
7348 | static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, |
7349 | SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) { |
7350 | if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 || |
7351 | !Subtarget.hasAnyFMA()) |
7352 | return false; |
7353 | |
7354 | // FIXME: These checks must match the similar ones in |
7355 | // DAGCombiner::visitFADDForFMACombine. It would be good to have one |
7356 | // function that would answer if it is Ok to fuse MUL + ADD to FMADD |
7357 | // or MUL + ADDSUB to FMADDSUB. |
7358 | const TargetOptions &Options = DAG.getTarget().Options; |
7359 | bool AllowFusion = |
7360 | (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); |
7361 | if (!AllowFusion) |
7362 | return false; |
7363 | |
7364 | Opnd2 = Opnd1; |
7365 | Opnd1 = Opnd0.getOperand(1); |
7366 | Opnd0 = Opnd0.getOperand(0); |
7367 | |
7368 | return true; |
7369 | } |
7370 | |
7371 | /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7372 | /// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7373 | static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, |
7374 | const X86Subtarget &Subtarget, |
7375 | SelectionDAG &DAG) { |
7376 | SDValue Opnd0, Opnd1; |
7377 | if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1)) |
7378 | return SDValue(); |
7379 | |
7380 | MVT VT = BV->getSimpleValueType(0); |
7381 | SDLoc DL(BV); |
7382 | |
7383 | // Try to generate X86ISD::FMADDSUB node here. |
7384 | SDValue Opnd2; |
7385 | if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) |
7386 | return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); |
7387 | |
7388 | // Do not generate X86ISD::ADDSUB node for 512-bit types even though |
7389 | // the ADDSUB idiom has been successfully recognized. There are no known |
7390 | // X86 targets with 512-bit ADDSUB instructions! |
7391 | // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom |
7392 | // recognition. |
7393 | if (VT.is512BitVector()) |
7394 | return SDValue(); |
7395 | |
7396 | return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); |
7397 | } |
7398 | |
7399 | /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. |
7400 | static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, |
7401 | const X86Subtarget &Subtarget, |
7402 | SelectionDAG &DAG) { |
7403 | MVT VT = BV->getSimpleValueType(0); |
7404 | unsigned NumElts = VT.getVectorNumElements(); |
7405 | unsigned NumUndefsLO = 0; |
7406 | unsigned NumUndefsHI = 0; |
7407 | unsigned Half = NumElts/2; |
7408 | |
7409 | // Count the number of UNDEF operands in the input build_vector.
7410 | for (unsigned i = 0, e = Half; i != e; ++i) |
7411 | if (BV->getOperand(i)->isUndef()) |
7412 | NumUndefsLO++; |
7413 | |
7414 | for (unsigned i = Half, e = NumElts; i != e; ++i) |
7415 | if (BV->getOperand(i)->isUndef()) |
7416 | NumUndefsHI++; |
7417 | |
7418 | // Early exit if this is either a build_vector of all UNDEFs, or all but
7419 | // one of the operands are UNDEF.
7420 | if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) |
7421 | return SDValue(); |
7422 | |
7423 | SDLoc DL(BV); |
7424 | SDValue InVec0, InVec1; |
7425 | if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) { |
7426 | // Try to match an SSE3 float HADD/HSUB. |
7427 | if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) |
7428 | return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); |
7429 | |
7430 | if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) |
7431 | return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); |
7432 | } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) { |
7433 | // Try to match an SSSE3 integer HADD/HSUB. |
7434 | if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) |
7435 | return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); |
7436 | |
7437 | if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) |
7438 | return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); |
7439 | } |
7440 | |
7441 | if (!Subtarget.hasAVX()) |
7442 | return SDValue(); |
7443 | |
7444 | if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { |
7445 | // Try to match an AVX horizontal add/sub of packed single/double |
7446 | // precision floating point values from 256-bit vectors. |
7447 | SDValue InVec2, InVec3; |
7448 | if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && |
7449 | isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && |
7450 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && |
7451 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) |
7452 | return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); |
7453 | |
7454 | if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && |
7455 | isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && |
7456 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && |
7457 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) |
7458 | return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); |
7459 | } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { |
7460 | // Try to match an AVX2 horizontal add/sub of signed integers. |
7461 | SDValue InVec2, InVec3; |
7462 | unsigned X86Opcode; |
7463 | bool CanFold = true; |
7464 | |
7465 | if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && |
7466 | isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && |
7467 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && |
7468 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) |
7469 | X86Opcode = X86ISD::HADD; |
7470 | else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && |
7471 | isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && |
7472 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && |
7473 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) |
7474 | X86Opcode = X86ISD::HSUB; |
7475 | else |
7476 | CanFold = false; |
7477 | |
7478 | if (CanFold) { |
7479 | // Fold this build_vector into a single horizontal add/sub. |
7480 | // Do this only if the target has AVX2. |
7481 | if (Subtarget.hasAVX2()) |
7482 | return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); |
7483 | |
7484 | // Do not try to expand this build_vector into a pair of horizontal |
7485 | // add/sub if we can emit a pair of scalar add/sub. |
7486 | if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) |
7487 | return SDValue(); |
7488 | |
7489 | // Convert this build_vector into a pair of horizontal binop followed by |
7490 | // a concat vector. |
7491 | bool isUndefLO = NumUndefsLO == Half; |
7492 | bool isUndefHI = NumUndefsHI == Half; |
7493 | return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, |
7494 | isUndefLO, isUndefHI); |
7495 | } |
7496 | } |
7497 | |
7498 | if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || |
7499 | VT == MVT::v16i16) && Subtarget.hasAVX()) { |
7500 | unsigned X86Opcode; |
7501 | if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) |
7502 | X86Opcode = X86ISD::HADD; |
7503 | else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) |
7504 | X86Opcode = X86ISD::HSUB; |
7505 | else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) |
7506 | X86Opcode = X86ISD::FHADD; |
7507 | else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) |
7508 | X86Opcode = X86ISD::FHSUB; |
7509 | else |
7510 | return SDValue(); |
7511 | |
7512 | // Don't try to expand this build_vector into a pair of horizontal add/sub |
7513 | // if we can simply emit a pair of scalar add/sub. |
7514 | if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) |
7515 | return SDValue(); |
7516 | |
7517 | // Convert this build_vector into two horizontal add/sub followed by |
7518 | // a concat vector. |
7519 | bool isUndefLO = NumUndefsLO == Half; |
7520 | bool isUndefHI = NumUndefsHI == Half; |
7521 | return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, |
7522 | isUndefLO, isUndefHI); |
7523 | } |
7524 | |
7525 | return SDValue(); |
7526 | } |
7527 | |
7528 | /// If a BUILD_VECTOR's source elements all apply the same bit operation and |
7529 | /// one of their operands is constant, lower to a pair of BUILD_VECTOR and |
7530 | /// just apply the bit to the vectors. |
7531 | /// NOTE: It's not in our interest to grow a general-purpose vectorizer out of
7532 | /// this, but enough scalar bit operations are created by the later
7533 | /// legalization + scalarization stages to need basic support.
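/// A sketch with hypothetical operands:
///   (v4i32 build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes
///   (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8))
/// so the bit operation runs once as a vector op instead of per element.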
7534 | static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, |
7535 | SelectionDAG &DAG) { |
7536 | SDLoc DL(Op); |
7537 | MVT VT = Op->getSimpleValueType(0); |
7538 | unsigned NumElems = VT.getVectorNumElements(); |
7539 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
7540 | |
7541 | // Check that all elements have the same opcode. |
7542 | // TODO: Should we allow UNDEFs, and if so, how many?
7543 | unsigned Opcode = Op->getOperand(0).getOpcode(); |
7544 | for (unsigned i = 1; i < NumElems; ++i) |
7545 | if (Opcode != Op->getOperand(i).getOpcode()) |
7546 | return SDValue(); |
7547 | |
7548 | // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). |
7549 | switch (Opcode) { |
7550 | default: |
7551 | return SDValue(); |
7552 | case ISD::AND: |
7553 | case ISD::XOR: |
7554 | case ISD::OR: |
7555 | if (!TLI.isOperationLegalOrPromote(Opcode, VT)) |
7556 | return SDValue(); |
7557 | break; |
7558 | } |
7559 | |
7560 | SmallVector<SDValue, 4> LHSElts, RHSElts; |
7561 | for (SDValue Elt : Op->ops()) { |
7562 | SDValue LHS = Elt.getOperand(0); |
7563 | SDValue RHS = Elt.getOperand(1); |
7564 | |
7565 | // We expect the canonicalized RHS operand to be the constant. |
7566 | if (!isa<ConstantSDNode>(RHS)) |
7567 | return SDValue(); |
7568 | LHSElts.push_back(LHS); |
7569 | RHSElts.push_back(RHS); |
7570 | } |
7571 | |
7572 | SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); |
7573 | SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); |
7574 | return DAG.getNode(Opcode, DL, VT, LHS, RHS); |
7575 | } |
7576 | |
7577 | /// Create a vector constant without a load. SSE/AVX provide the bare minimum |
7578 | /// functionality to do this, so it's all zeros, all ones, or some derivation |
7579 | /// that is cheap to calculate. |
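/// For instance, an all-ones v4i32 build_vector is typically matched to a
/// single "pcmpeqd %xmm0, %xmm0" and an all-zeros one to "pxor %xmm0, %xmm0",
/// with no constant-pool load at all (the exact instruction choice is up to
/// later pattern matching).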
7580 | static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, |
7581 | const X86Subtarget &Subtarget) { |
7582 | SDLoc DL(Op); |
7583 | MVT VT = Op.getSimpleValueType(); |
7584 | |
7585 | // Vectors containing all zeros can be matched by pxor and xorps. |
7586 | if (ISD::isBuildVectorAllZeros(Op.getNode())) { |
7587 | // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd |
7588 | // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. |
7589 | if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) |
7590 | return Op; |
7591 | |
7592 | return getZeroVector(VT, Subtarget, DAG, DL); |
7593 | } |
7594 | |
7595 | // Vectors containing all ones can be matched by pcmpeqd on 128-bit width |
7596 | // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use |
7597 | // vpcmpeqd on 256-bit vectors. |
7598 | if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { |
7599 | if (VT == MVT::v4i32 || VT == MVT::v16i32 || |
7600 | (VT == MVT::v8i32 && Subtarget.hasInt256())) |
7601 | return Op; |
7602 | |
7603 | return getOnesVector(VT, DAG, DL); |
7604 | } |
7605 | |
7606 | return SDValue(); |
7607 | } |
7608 | |
7609 | SDValue |
7610 | X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { |
7611 | SDLoc dl(Op); |
7612 | |
7613 | MVT VT = Op.getSimpleValueType(); |
7614 | MVT ExtVT = VT.getVectorElementType(); |
7615 | unsigned NumElems = Op.getNumOperands(); |
7616 | |
7617 | // Generate vectors for predicate vectors. |
7618 | if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) |
7619 | return LowerBUILD_VECTORvXi1(Op, DAG); |
7620 | |
7621 | if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) |
7622 | return VectorConstant; |
7623 | |
7624 | BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); |
7625 | if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) |
7626 | return AddSub; |
7627 | if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) |
7628 | return HorizontalOp; |
7629 | if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) |
7630 | return Broadcast; |
7631 | if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) |
7632 | return BitOp; |
7633 | |
7634 | unsigned EVTBits = ExtVT.getSizeInBits(); |
7635 | |
7636 | unsigned NumZero = 0; |
7637 | unsigned NumNonZero = 0; |
7638 | uint64_t NonZeros = 0; |
7639 | bool IsAllConstants = true; |
7640 | SmallSet<SDValue, 8> Values; |
7641 | for (unsigned i = 0; i < NumElems; ++i) { |
7642 | SDValue Elt = Op.getOperand(i); |
7643 | if (Elt.isUndef()) |
7644 | continue; |
7645 | Values.insert(Elt); |
7646 | if (Elt.getOpcode() != ISD::Constant && |
7647 | Elt.getOpcode() != ISD::ConstantFP) |
7648 | IsAllConstants = false; |
7649 | if (X86::isZeroNode(Elt)) |
7650 | NumZero++; |
7651 | else { |
7652 | assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7653 | NonZeros |= ((uint64_t)1 << i); |
7654 | NumNonZero++; |
7655 | } |
7656 | } |
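// A worked example with hypothetical operands: for
// (v4i32 build_vector 0, a, 0, b) the loop above produces NumZero = 2,
// NumNonZero = 2 and NonZeros = 0b1010, so countTrailingZeros(NonZeros)
// below picks index 1, the first non-zero element.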
7657 | |
7658 | // All-undef vector: return UNDEF. All-zero vectors were handled above.
7659 | if (NumNonZero == 0) |
7660 | return DAG.getUNDEF(VT); |
7661 | |
7662 | // Special case for single non-zero, non-undef, element. |
7663 | if (NumNonZero == 1) { |
7664 | unsigned Idx = countTrailingZeros(NonZeros); |
7665 | SDValue Item = Op.getOperand(Idx); |
7666 | |
7667 | // If this is an insertion of an i64 value on x86-32, and if the top bits of |
7668 | // the value are obviously zero, truncate the value to i32 and do the |
7669 | // insertion that way. Only do this if the value is non-constant or if the |
7670 | // value is a constant being inserted into element 0. It is cheaper to do |
7671 | // a constant pool load than it is to do a movd + shuffle. |
7672 | if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && |
7673 | (!IsAllConstants || Idx == 0)) { |
7674 | if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) { |
7675 | // Handle SSE only. |
7676 | assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7677 | MVT VecVT = MVT::v4i32; |
7678 | |
7679 | // Truncate the value (which may itself be a constant) to i32, and |
7680 | // convert it to a vector with movd (S2V+shuffle to zero extend). |
7681 | Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); |
7682 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); |
7683 | return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( |
7684 | Item, Idx * 2, true, Subtarget, DAG)); |
7685 | } |
7686 | } |
7687 | |
7688 | // If we have a constant or non-constant insertion into the low element of |
7689 | // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into |
7690 | // the rest of the elements. This will be matched as movd/movq/movss/movsd |
7691 | // depending on what the source datatype is. |
7692 | if (Idx == 0) { |
7693 | if (NumZero == 0) |
7694 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); |
7695 | |
7696 | if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || |
7697 | (ExtVT == MVT::i64 && Subtarget.is64Bit())) { |
7698 | assert((VT.is128BitVector() || VT.is256BitVector() ||
7699 |         VT.is512BitVector()) &&
7700 |        "Expected an SSE value type!");
7701 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); |
7702 | // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. |
7703 | return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); |
7704 | } |
7705 | |
7706 | // We can't directly insert an i8 or i16 into a vector, so zero extend |
7707 | // it to i32 first. |
7708 | if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { |
7709 | Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); |
7710 | if (VT.getSizeInBits() >= 256) { |
7711 | MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); |
7712 | if (Subtarget.hasAVX()) { |
7713 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); |
7714 | Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); |
7715 | } else { |
7716 | // Without AVX, we need to extend to a 128-bit vector and then |
7717 | // insert into the 256-bit vector. |
7718 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); |
7719 | SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); |
7720 | Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); |
7721 | } |
7722 | } else { |
7723 | assert(VT.is128BitVector() && "Expected an SSE value type!");
7724 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); |
7725 | Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); |
7726 | } |
7727 | return DAG.getBitcast(VT, Item); |
7728 | } |
7729 | } |
7730 | |
7731 | // Is it a vector logical left shift? |
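// Sketch with a hypothetical operand: (v2i64 build_vector 0, x) is the same
// as shifting (scalar_to_vector x) left by 64 bits across the full 128-bit
// register, i.e. a single PSLLDQ of the movq'd scalar.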
7732 | if (NumElems == 2 && Idx == 1 && |
7733 | X86::isZeroNode(Op.getOperand(0)) && |
7734 | !X86::isZeroNode(Op.getOperand(1))) { |
7735 | unsigned NumBits = VT.getSizeInBits(); |
7736 | return getVShift(true, VT, |
7737 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, |
7738 | VT, Op.getOperand(1)), |
7739 | NumBits/2, DAG, *this, dl); |
7740 | } |
7741 | |
7742 | if (IsAllConstants) // Otherwise, it's better to do a constpool load. |
7743 | return SDValue(); |
7744 | |
7745 | // Otherwise, if this is a vector with i32 or f32 elements, and the element |
7746 | // is a non-constant being inserted into an element other than the low one, |
7747 | // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka |
7748 | // movd/movss) to move this into the low element, then shuffle it into |
7749 | // place. |
7750 | if (EVTBits == 32) { |
7751 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); |
7752 | return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); |
7753 | } |
7754 | } |
7755 | |
7756 | // Splat is obviously ok. Let legalizer expand it to a shuffle. |
7757 | if (Values.size() == 1) { |
7758 | if (EVTBits == 32) { |
7759 | // Instead of a shuffle like this: |
7760 | // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> |
7761 | // Check if it's possible to issue this instead. |
7762 | // shuffle (vload ptr), undef, <1, 1, 1, 1>
7763 | unsigned Idx = countTrailingZeros(NonZeros); |
7764 | SDValue Item = Op.getOperand(Idx); |
7765 | if (Op.getNode()->isOnlyUserOf(Item.getNode())) |
7766 | return LowerAsSplatVectorLoad(Item, VT, dl, DAG); |
7767 | } |
7768 | return SDValue(); |
7769 | } |
7770 | |
7771 | // A vector full of immediates; various special cases are already |
7772 | // handled, so this is best done with a single constant-pool load. |
7773 | if (IsAllConstants) |
7774 | return SDValue(); |
7775 | |
7776 | // See if we can use a vector load to get all of the elements. |
7777 | if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { |
7778 | SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); |
7779 | if (SDValue LD = |
7780 | EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) |
7781 | return LD; |
7782 | } |
7783 | |
7784 | // For AVX-length vectors, build the individual 128-bit pieces and use |
7785 | // shuffles to put them in place. |
7786 | if (VT.is256BitVector() || VT.is512BitVector()) { |
7787 | SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); |
7788 | |
7789 | EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); |
7790 | |
7791 | // Build both the lower and upper subvector. |
7792 | SDValue Lower = |
7793 | DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2)); |
7794 | SDValue Upper = DAG.getBuildVector( |
7795 | HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2)); |
7796 | |
7797 | // Recreate the wider vector with the lower and upper part. |
7798 | if (VT.is256BitVector()) |
7799 | return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); |
7800 | return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); |
7801 | } |
7802 | |
7803 | // Let legalizer expand 2-wide build_vectors. |
7804 | if (EVTBits == 64) { |
7805 | if (NumNonZero == 1) { |
7806 | // One half is zero or undef. |
7807 | unsigned Idx = countTrailingZeros(NonZeros); |
7808 | SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, |
7809 | Op.getOperand(Idx)); |
7810 | return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); |
7811 | } |
7812 | return SDValue(); |
7813 | } |
7814 | |
7815 | // If element VT is < 32 bits, convert it to inserts into a zero vector. |
7816 | if (EVTBits == 8 && NumElems == 16) |
7817 | if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, |
7818 | DAG, Subtarget)) |
7819 | return V; |
7820 | |
7821 | if (EVTBits == 16 && NumElems == 8) |
7822 | if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, |
7823 | DAG, Subtarget)) |
7824 | return V; |
7825 | |
7826 | // If element VT is == 32 bits and the vector has 4 elements, try to
7827 | // generate an INSERTPS.
7827 | if (EVTBits == 32 && NumElems == 4) |
7828 | if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) |
7829 | return V; |
7830 | |
7831 | // If element VT is == 32 bits, turn it into a number of shuffles. |
7832 | if (NumElems == 4 && NumZero > 0) { |
7833 | SmallVector<SDValue, 8> Ops(NumElems); |
7834 | for (unsigned i = 0; i < 4; ++i) { |
7835 | bool isZero = !(NonZeros & (1ULL << i)); |
7836 | if (isZero) |
7837 | Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); |
7838 | else |
7839 | Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); |
7840 | } |
7841 | |
7842 | for (unsigned i = 0; i < 2; ++i) { |
7843 | switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { |
7844 | default: break; |
7845 | case 0: |
7846 | Ops[i] = Ops[i*2]; // Must be a zero vector. |
7847 | break; |
7848 | case 1: |
7849 | Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); |
7850 | break; |
7851 | case 2: |
7852 | Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); |
7853 | break; |
7854 | case 3: |
7855 | Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); |
7856 | break; |
7857 | } |
7858 | } |
7859 | |
7860 | bool Reverse1 = (NonZeros & 0x3) == 2; |
7861 | bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; |
7862 | int MaskVec[] = { |
7863 | Reverse1 ? 1 : 0, |
7864 | Reverse1 ? 0 : 1, |
7865 | static_cast<int>(Reverse2 ? NumElems+1 : NumElems), |
7866 | static_cast<int>(Reverse2 ? NumElems : NumElems+1) |
7867 | }; |
7868 | return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); |
7869 | } |
7870 | |
7871 | if (Values.size() > 1 && VT.is128BitVector()) { |
7872 | // Check for a build vector that is mostly a shuffle plus a few insertions.
7873 | if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) |
7874 | return Sh; |
7875 | |
7876 | // For SSE 4.1, use insertps to insert each high element into the result.
7877 | if (Subtarget.hasSSE41()) { |
7878 | SDValue Result; |
7879 | if (!Op.getOperand(0).isUndef()) |
7880 | Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); |
7881 | else |
7882 | Result = DAG.getUNDEF(VT); |
7883 | |
7884 | for (unsigned i = 1; i < NumElems; ++i) { |
7885 | if (Op.getOperand(i).isUndef()) continue; |
7886 | Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, |
7887 | Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); |
7888 | } |
7889 | return Result; |
7890 | } |
7891 | |
7892 | // Otherwise, expand into a number of unpckl*. Start by extending each of
7893 | // our (non-undef) elements to the full vector width, with the element in
7894 | // the bottom slot of the vector (which generates no code for SSE).
7895 | SmallVector<SDValue, 8> Ops(NumElems); |
7896 | for (unsigned i = 0; i < NumElems; ++i) { |
7897 | if (!Op.getOperand(i).isUndef()) |
7898 | Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); |
7899 | else |
7900 | Ops[i] = DAG.getUNDEF(VT); |
7901 | } |
7902 | |
7903 | // Next, we iteratively mix elements, e.g. for v4f32: |
7904 | // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0> |
7905 | // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2> |
7906 | // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> |
7907 | for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { |
7908 | // Generate scaled UNPCKL shuffle mask. |
7909 | SmallVector<int, 16> Mask; |
7910 | for(unsigned i = 0; i != Scale; ++i) |
7911 | Mask.push_back(i); |
7912 | for (unsigned i = 0; i != Scale; ++i) |
7913 | Mask.push_back(NumElems+i); |
7914 | Mask.append(NumElems - Mask.size(), SM_SentinelUndef); |
7915 | |
7916 | for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) |
7917 | Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); |
7918 | } |
7919 | return Ops[0]; |
7920 | } |
7921 | return SDValue(); |
7922 | } |
7923 | |
7924 | // 256-bit AVX can use the vinsertf128 instruction |
7925 | // to create 256-bit vectors from two other 128-bit ones. |
7926 | static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { |
7927 | SDLoc dl(Op); |
7928 | MVT ResVT = Op.getSimpleValueType(); |
7929 | |
7930 | assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
7931 | "Value type must be 256-/512-bit wide");
7932 | |
7933 | SDValue V1 = Op.getOperand(0); |
7934 | SDValue V2 = Op.getOperand(1); |
7935 | unsigned NumElems = ResVT.getVectorNumElements(); |
7936 | if (ResVT.is256BitVector()) |
7937 | return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); |
7938 | |
7939 | if (Op.getNumOperands() == 4) { |
7940 | MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), |
7941 | ResVT.getVectorNumElements()/2); |
7942 | SDValue V3 = Op.getOperand(2); |
7943 | SDValue V4 = Op.getOperand(3); |
7944 | return concat256BitVectors( |
7945 | concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl), |
7946 | concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT, |
7947 | NumElems, DAG, dl); |
7948 | } |
7949 | return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); |
7950 | } |
7951 | |
7952 | // Return true if all the operands of the given CONCAT_VECTORS node are zeros |
7953 | // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) |
7954 | static bool isExpandWithZeros(const SDValue &Op) { |
7955 | assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
7956 | "Expand with zeros only possible in CONCAT_VECTORS nodes!");
7957 | |
7958 | for (unsigned i = 1; i < Op.getNumOperands(); i++) |
7959 | if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) |
7960 | return false; |
7961 | |
7962 | return true; |
7963 | } |
7964 | |
7965 | // If the given node is a type promotion (by concatenating i1 zeros) of the
7966 | // result of a node that already zeroes all upper bits of a k-register,
7967 | // returns that inner node's value; otherwise returns an empty SDValue.
7968 | static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { |
7969 | unsigned Opc = Op.getOpcode(); |
7970 | |
7971 | assert(Opc == ISD::CONCAT_VECTORS &&
7972 | Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
7973 | "Unexpected node to check for type promotion!");
7974 | |
7975 | // As long as we are concatenating zeros to the upper part of a previous node
7976 | // result, climb up the tree until a node with a different opcode is
7977 | // encountered.
7978 | while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { |
7979 | if (Opc == ISD::INSERT_SUBVECTOR) { |
7980 | if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && |
7981 | Op.getConstantOperandVal(2) == 0) |
7982 | Op = Op.getOperand(1); |
7983 | else |
7984 | return SDValue(); |
7985 | } else { // Opc == ISD::CONCAT_VECTORS |
7986 | if (isExpandWithZeros(Op)) |
7987 | Op = Op.getOperand(0); |
7988 | else |
7989 | return SDValue(); |
7990 | } |
7991 | Opc = Op.getOpcode(); |
7992 | } |
7993 | |
7994 | // Check if the first inserted node zeroes the upper bits, or an 'and' result |
7995 | // of a node that zeros the upper bits (its masked version). |
7996 | if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || |
7997 | (Op.getOpcode() == ISD::AND && |
7998 | (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || |
7999 | isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { |
8000 | return Op; |
8001 | } |
8002 | |
8003 | return SDValue(); |
8004 | } |
8005 | |
8006 | static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, |
8007 | const X86Subtarget &Subtarget, |
8008 | SelectionDAG & DAG) { |
8009 | SDLoc dl(Op); |
8010 | MVT ResVT = Op.getSimpleValueType(); |
8011 | unsigned NumOfOperands = Op.getNumOperands(); |
8012 | |
8013 | assert(isPowerOf2_32(NumOfOperands) &&
8014 | "Unexpected number of operands in CONCAT_VECTORS");
8015 | |
8016 | // If this node promotes - by concatenating zeroes - the type of the result
8017 | // of a node with an instruction that zeroes all upper (irrelevant) bits of
8018 | // the output register, mark it as legal and catch the pattern in instruction
8019 | // selection to avoid emitting extra instructions (for zeroing upper bits).
8020 | if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { |
8021 | SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64); |
8022 | SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC); |
8023 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, |
8024 | ZeroC); |
8025 | } |
8026 | |
8027 | SDValue Undef = DAG.getUNDEF(ResVT); |
8028 | if (NumOfOperands > 2) { |
8029 | // Specialize the cases when all, or all but one, of the operands are undef. |
8030 | unsigned NumOfDefinedOps = 0; |
8031 | unsigned OpIdx = 0; |
8032 | for (unsigned i = 0; i < NumOfOperands; i++) |
8033 | if (!Op.getOperand(i).isUndef()) { |
8034 | NumOfDefinedOps++; |
8035 | OpIdx = i; |
8036 | } |
8037 | if (NumOfDefinedOps == 0) |
8038 | return Undef; |
8039 | if (NumOfDefinedOps == 1) { |
8040 | unsigned SubVecNumElts = |
8041 | Op.getOperand(OpIdx).getValueType().getVectorNumElements(); |
8042 | SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); |
8043 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, |
8044 | Op.getOperand(OpIdx), IdxVal); |
8045 | } |
8046 | |
8047 | MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), |
8048 | ResVT.getVectorNumElements()/2); |
8049 | SmallVector<SDValue, 2> Ops; |
8050 | for (unsigned i = 0; i < NumOfOperands/2; i++) |
8051 | Ops.push_back(Op.getOperand(i)); |
8052 | SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); |
8053 | Ops.clear(); |
8054 | for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) |
8055 | Ops.push_back(Op.getOperand(i)); |
8056 | SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); |
8057 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); |
8058 | } |
8059 | |
8060 | // 2 operands |
8061 | SDValue V1 = Op.getOperand(0); |
8062 | SDValue V2 = Op.getOperand(1); |
8063 | unsigned NumElems = ResVT.getVectorNumElements(); |
8064 | assert(V1.getValueType() == V2.getValueType() &&
8065 | V1.getValueType().getVectorNumElements() == NumElems/2 &&
8066 | "Unexpected operands in CONCAT_VECTORS");
8067 | |
8068 | if (ResVT.getSizeInBits() >= 16) |
8069 | return Op; // The operation is legal with KUNPCK |
8070 | |
8071 | bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); |
8072 | bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); |
8073 | SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); |
8074 | if (IsZeroV1 && IsZeroV2) |
8075 | return ZeroVec; |
8076 | |
8077 | SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); |
8078 | if (V2.isUndef()) |
8079 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); |
8080 | if (IsZeroV2) |
8081 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); |
8082 | |
8083 | SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); |
8084 | if (V1.isUndef()) |
8085 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); |
8086 | |
8087 | if (IsZeroV1) |
8088 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); |
8089 | |
8090 | V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); |
8091 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); |
8092 | } |
8093 | |
8094 | static SDValue LowerCONCAT_VECTORS(SDValue Op, |
8095 | const X86Subtarget &Subtarget, |
8096 | SelectionDAG &DAG) { |
8097 | MVT VT = Op.getSimpleValueType(); |
8098 | if (VT.getVectorElementType() == MVT::i1) |
8099 | return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); |
8100 | |
8101 | assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8102 | (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8103 | Op.getNumOperands() == 4)));
8104 | |
8105 | // AVX can use the vinsertf128 instruction to create 256-bit vectors |
8106 | // from two other 128-bit ones. |
8107 | |
8108 | // A 512-bit vector may contain two 256-bit vectors or four 128-bit vectors.
8109 | return LowerAVXCONCAT_VECTORS(Op, DAG); |
8110 | } |
8111 | |
8112 | //===----------------------------------------------------------------------===// |
8113 | // Vector shuffle lowering |
8114 | // |
8115 | // This is an experimental code path for lowering vector shuffles on x86. It is |
8116 | // designed to handle arbitrary vector shuffles and blends, gracefully |
8117 | // degrading performance as necessary. It works hard to recognize idiomatic |
8118 | // shuffles and lower them to optimal instruction patterns without leaving |
8119 | // a framework that allows reasonably efficient handling of all vector shuffle |
8120 | // patterns. |
8121 | //===----------------------------------------------------------------------===// |
8122 | |
8123 | /// \brief Tiny helper function to identify a no-op mask. |
8124 | /// |
8125 | /// This is a somewhat boring predicate function. It checks whether the mask |
8126 | /// array input, which is assumed to be a single-input shuffle mask of the kind |
8127 | /// used by the X86 shuffle instructions (not a fully general |
8128 | /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an |
8129 | /// in-place shuffle are 'no-op's. |
8130 | static bool isNoopShuffleMask(ArrayRef<int> Mask) { |
8131 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
8132 | assert(Mask[i] >= -1 && "Out of bound mask element!");
8133 | if (Mask[i] >= 0 && Mask[i] != i) |
8134 | return false; |
8135 | } |
8136 | return true; |
8137 | } |
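
// For example, <0, -1, 2, 3> is a no-op mask (the undef lane may keep its
// input element), while <0, 0, 2, 3> is not, since lane 1 pulls from lane 0.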
8138 | |
8139 | /// \brief Test whether there are elements crossing 128-bit lanes in this |
8140 | /// shuffle mask. |
8141 | /// |
8142 | /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations |
8143 | /// and we routinely test for these. |
8144 | static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { |
8145 | int LaneSize = 128 / VT.getScalarSizeInBits(); |
8146 | int Size = Mask.size(); |
8147 | for (int i = 0; i < Size; ++i) |
8148 | if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) |
8149 | return true; |
8150 | return false; |
8151 | } |
8152 | |
8153 | /// \brief Test whether a shuffle mask is equivalent within each sub-lane. |
8154 | /// |
8155 | /// This checks a shuffle mask to see if it is performing the same |
8156 | /// lane-relative shuffle in each sub-lane. This trivially implies |
8157 | /// that it is also not lane-crossing. It may however involve a blend from the |
8158 | /// same lane of a second vector. |
8159 | /// |
8160 | /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is |
8161 | /// non-trivial to compute in the face of undef lanes. The representation is |
8162 | /// suitable for use with existing 128-bit shuffles as entries from the second |
8163 | /// vector have been remapped to [LaneSize, 2*LaneSize). |
8164 | static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, |
8165 | ArrayRef<int> Mask, |
8166 | SmallVectorImpl<int> &RepeatedMask) { |
8167 | auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); |
8168 | RepeatedMask.assign(LaneSize, -1); |
8169 | int Size = Mask.size(); |
8170 | for (int i = 0; i < Size; ++i) { |
8171 | assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8172 | if (Mask[i] < 0) |
8173 | continue; |
8174 | if ((Mask[i] % Size) / LaneSize != i / LaneSize) |
8175 | // This entry crosses lanes, so there is no way to model this shuffle. |
8176 | return false; |
8177 | |
8178 | // Ok, handle the in-lane shuffles by detecting if and when they repeat. |
8179 | // Adjust second vector indices to start at LaneSize instead of Size. |
8180 | int LocalM = Mask[i] < Size ? Mask[i] % LaneSize |
8181 | : Mask[i] % LaneSize + LaneSize; |
8182 | if (RepeatedMask[i % LaneSize] < 0) |
8183 | // This is the first non-undef entry in this slot of a 128-bit lane. |
8184 | RepeatedMask[i % LaneSize] = LocalM; |
8185 | else if (RepeatedMask[i % LaneSize] != LocalM) |
8186 | // Found a mismatch with the repeated mask. |
8187 | return false; |
8188 | } |
8189 | return true; |
8190 | } |
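
// A worked example of the repeated-mask rewrite above (hypothetical values,
// shown as it might appear in a unit test; not part of the original file):
#if 0
  // v8f32: a 128-bit lane holds 4 elements. This two-input interleave does
  // the same lane-relative shuffle in both lanes, so it repeats:
  SmallVector<int, 4> Repeated;
  bool Ok = is128BitLaneRepeatedShuffleMask(
      MVT::v8f32, {0, 8, 1, 9, 4, 12, 5, 13}, Repeated);
  // Ok == true and Repeated == {0, 4, 1, 5}: entries from the second vector
  // have been remapped from [8, 16) into [4, 8), as documented above.
#endif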
8191 | |
8192 | /// Test whether a shuffle mask is equivalent within each 128-bit lane. |
8193 | static bool |
8194 | is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, |
8195 | SmallVectorImpl<int> &RepeatedMask) { |
8196 | return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); |
8197 | } |
8198 | |
8199 | /// Test whether a shuffle mask is equivalent within each 256-bit lane. |
8200 | static bool |
8201 | is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, |
8202 | SmallVectorImpl<int> &RepeatedMask) { |
8203 | return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); |
8204 | } |
8205 | |
8206 | /// Test whether a target shuffle mask is equivalent within each sub-lane. |
8207 | /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. |
8208 | static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, |
8209 | ArrayRef<int> Mask, |
8210 | SmallVectorImpl<int> &RepeatedMask) { |
8211 | int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); |
8212 | RepeatedMask.assign(LaneSize, SM_SentinelUndef); |
8213 | int Size = Mask.size(); |
8214 | for (int i = 0; i < Size; ++i) { |
8215 | assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8216 | if (Mask[i] == SM_SentinelUndef) |
8217 | continue; |
8218 | if (Mask[i] == SM_SentinelZero) { |
8219 | if (!isUndefOrZero(RepeatedMask[i % LaneSize])) |
8220 | return false; |
8221 | RepeatedMask[i % LaneSize] = SM_SentinelZero; |
8222 | continue; |
8223 | } |
8224 | if ((Mask[i] % Size) / LaneSize != i / LaneSize) |
8225 | // This entry crosses lanes, so there is no way to model this shuffle. |
8226 | return false; |
8227 | |
8228 | // Ok, handle the in-lane shuffles by detecting if and when they repeat. |
8229 | // Adjust second vector indices to start at LaneSize instead of Size. |
8230 | int LocalM = |
8231 | Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; |
8232 | if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) |
8233 | // This is the first non-undef entry in this slot of a 128-bit lane. |
8234 | RepeatedMask[i % LaneSize] = LocalM; |
8235 | else if (RepeatedMask[i % LaneSize] != LocalM) |
8236 | // Found a mismatch with the repeated mask. |
8237 | return false; |
8238 | } |
8239 | return true; |
8240 | } |
8241 | |
8242 | /// \brief Checks whether a shuffle mask is equivalent to an explicit list of |
8243 | /// arguments. |
8244 | /// |
8245 | /// This is a fast way to test a shuffle mask against a fixed pattern: |
8246 | /// |
8247 | /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8248 | /// |
8249 | /// It returns true if the mask is exactly as wide as the argument list, and |
8250 | /// each element of the mask is either -1 (signifying undef) or the value given |
8251 | /// in the argument. |
8252 | static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, |
8253 | ArrayRef<int> ExpectedMask) { |
8254 | if (Mask.size() != ExpectedMask.size()) |
8255 | return false; |
8256 | |
8257 | int Size = Mask.size(); |
8258 | |
8259 | // If the values are build vectors, we can look through them to find |
8260 | // equivalent inputs that make the shuffles equivalent. |
8261 | auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); |
8262 | auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); |
8263 | |
8264 | for (int i = 0; i < Size; ++i) { |
8265 | assert(Mask[i] >= -1 && "Out of bound mask element!");
8266 | if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { |
8267 | auto *MaskBV = Mask[i] < Size ? BV1 : BV2; |
8268 | auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; |
8269 | if (!MaskBV || !ExpectedBV || |
8270 | MaskBV->getOperand(Mask[i] % Size) != |
8271 | ExpectedBV->getOperand(ExpectedMask[i] % Size)) |
8272 | return false; |
8273 | } |
8274 | } |
8275 | |
8276 | return true; |
8277 | } |
8278 | |
8279 | /// Checks whether a target shuffle mask is equivalent to an explicit pattern. |
8280 | /// |
8281 | /// The masks must be exactly the same width. |
8282 | /// |
8283 | /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding |
8284 | /// value in ExpectedMask is always accepted. Otherwise the indices must match. |
8285 | /// |
8286 | /// SM_SentinelZero is accepted as a valid negative index but must match in both masks.
8287 | static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, |
8288 | ArrayRef<int> ExpectedMask) { |
8289 | int Size = Mask.size(); |
8290 | if (Size != (int)ExpectedMask.size()) |
8291 | return false; |
8292 | |
8293 | for (int i = 0; i < Size; ++i) |
8294 | if (Mask[i] == SM_SentinelUndef) |
8295 | continue; |
8296 | else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) |
8297 | return false; |
8298 | else if (Mask[i] != ExpectedMask[i]) |
8299 | return false; |
8300 | |
8301 | return true; |
8302 | } |
8303 | |
8304 | // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle |
8305 | // mask. |
8306 | static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, |
8307 | const APInt &Zeroable) { |
8308 | int NumElts = Mask.size(); |
8309 | assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8310 | |
8311 | SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); |
8312 | for (int i = 0; i != NumElts; ++i) { |
8313 | int M = Mask[i]; |
8314 | if (M == SM_SentinelUndef) |
8315 | continue; |
8316 | assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8317 | TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); |
8318 | } |
8319 | return TargetMask; |
8320 | } |
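
// A worked example of the merge (hypothetical values, not from this file):
// with Mask = <0, 5, 2, 7> and element 1 known zeroable, the merged target
// mask replaces that entry with SM_SentinelZero.
#if 0
  APInt Zeroable(4, 0x2); // Only bit 1 set: element 1 is zeroable.
  SmallVector<int, 64> TM = createTargetShuffleMask({0, 5, 2, 7}, Zeroable);
  // TM == {0, SM_SentinelZero, 2, 7}
#endif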
8321 | |
8322 | // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd |
8323 | // instructions. |
8324 | static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { |
8325 | if (VT != MVT::v8i32 && VT != MVT::v8f32) |
8326 | return false; |
8327 | |
8328 | SmallVector<int, 8> Unpcklwd; |
8329 | createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, |
8330 | /* Unary = */ false); |
8331 | SmallVector<int, 8> Unpckhwd; |
8332 | createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, |
8333 | /* Unary = */ false); |
8334 | bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || |
8335 | isTargetShuffleEquivalent(Mask, Unpckhwd)); |
8336 | return IsUnpackwdMask; |
8337 | } |
8338 | |
8339 | /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. |
8340 | /// |
8341 | /// This helper function produces an 8-bit shuffle immediate corresponding to |
8342 | /// the ubiquitous shuffle encoding scheme used in x86 instructions for |
8343 | /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for |
8344 | /// example. |
8345 | /// |
8346 | /// NB: We rely heavily on "undef" masks preserving the input lane. |
8347 | static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { |
8348 | assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8349 | assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8350 | assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8351 | assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8352 | assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8353 | |
8354 | unsigned Imm = 0; |
8355 | Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; |
8356 | Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; |
8357 | Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; |
8358 | Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; |
8359 | return Imm; |
8360 | } |
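
// Worked encodings for the helper above (illustrative, not part of the
// original file): destination lane i takes its source lane from immediate
// bits [2*i+1 : 2*i].
#if 0
  unsigned Rev = getV4X86ShuffleImm({3, 2, 1, 0});
  // Rev == 0x1B: 3 | (2 << 2) | (1 << 4) | (0 << 6), the immediate of
  // "pshufd $0x1b", which reverses all four lanes.
  unsigned Id = getV4X86ShuffleImm({-1, -1, -1, -1});
  // Id == 0xE4: undef lanes fall back to their identity index (0, 1, 2, 3),
  // relying on the "undef preserves the input lane" convention noted above.
#endif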
8361 | |
8362 | static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, |
8363 | SelectionDAG &DAG) { |
8364 | return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); |
8365 | } |
8366 | |
8367 | /// \brief Compute whether each element of a shuffle is zeroable. |
8368 | /// |
8369 | /// A "zeroable" vector shuffle element is one which can be lowered to zero. |
8370 | /// Either it is an undef element in the shuffle mask, the element of the input |
8371 | /// referenced is undef, or the element of the input referenced is known to be |
8372 | /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle |
8373 | /// as many lanes with this technique as possible to simplify the remaining |
8374 | /// shuffle. |
8375 | static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, |
8376 | SDValue V1, SDValue V2) { |
8377 | APInt Zeroable(Mask.size(), 0); |
8378 | V1 = peekThroughBitcasts(V1); |
8379 | V2 = peekThroughBitcasts(V2); |
8380 | |
8381 | bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); |
8382 | bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); |
8383 | |
8384 | int VectorSizeInBits = V1.getValueSizeInBits(); |
8385 | int ScalarSizeInBits = VectorSizeInBits / Mask.size(); |
8386 | assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8387 | |
8388 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
8389 | int M = Mask[i]; |
8390 | // Handle the easy cases. |
8391 | if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { |
8392 | Zeroable.setBit(i); |
8393 | continue; |
8394 | } |
8395 | |
8396 | // Determine shuffle input and normalize the mask. |
8397 | SDValue V = M < Size ? V1 : V2; |
8398 | M %= Size; |
8399 | |
8400 | // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. |
8401 | if (V.getOpcode() != ISD::BUILD_VECTOR) |
8402 | continue; |
8403 | |
8404 | // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8405 | // portion of the (larger) source element must be UNDEF/ZERO.
8406 | if ((Size % V.getNumOperands()) == 0) { |
8407 | int Scale = Size / V->getNumOperands(); |
8408 | SDValue Op = V.getOperand(M / Scale); |
8409 | if (Op.isUndef() || X86::isZeroNode(Op)) |
8410 | Zeroable.setBit(i); |
8411 | else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { |
8412 | APInt Val = Cst->getAPIntValue(); |
8413 | Val.lshrInPlace((M % Scale) * ScalarSizeInBits); |
8414 | Val = Val.getLoBits(ScalarSizeInBits); |
8415 | if (Val == 0) |
8416 | Zeroable.setBit(i); |
8417 | } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { |
8418 | APInt Val = Cst->getValueAPF().bitcastToAPInt(); |
8419 | Val.lshrInPlace((M % Scale) * ScalarSizeInBits); |
8420 | Val = Val.getLoBits(ScalarSizeInBits); |
8421 | if (Val == 0) |
8422 | Zeroable.setBit(i); |
8423 | } |
8424 | continue; |
8425 | } |
8426 | |
8427 | // If the BUILD_VECTOR has more elements than the mask, then all the
8428 | // (smaller) source elements must be UNDEF or ZERO.
8429 | if ((V.getNumOperands() % Size) == 0) { |
8430 | int Scale = V->getNumOperands() / Size; |
8431 | bool AllZeroable = true; |
8432 | for (int j = 0; j < Scale; ++j) { |
8433 | SDValue Op = V.getOperand((M * Scale) + j); |
8434 | AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); |
8435 | } |
8436 | if (AllZeroable) |
8437 | Zeroable.setBit(i); |
8438 | continue; |
8439 | } |
8440 | } |
8441 | |
8442 | return Zeroable; |
8443 | } |
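
// A worked example (hypothetical values; V1 and V2 are assumed SDValues,
// with V2 an all-zeros build vector and V1 a non-constant vector):
#if 0
  // Mask = <0, -1, 4, 1>:
  //   lane 0 -> V1[0]      : not zeroable (value unknown)
  //   lane 1 -> undef      : zeroable
  //   lane 2 -> V2[0] == 0 : zeroable
  //   lane 3 -> V1[1]      : not zeroable
  APInt Z = computeZeroableShuffleElements({0, -1, 4, 1}, V1, V2);
  // Z has bits 1 and 2 set (0x6).
#endif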
8444 | |
8445 | // The shuffle result is of the form:
8446 | // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
8447 | // ascending order. Each element of Zeroable corresponds to a particular
8448 | // element of Mask, as described in computeZeroableShuffleElements.
8449 | //
8450 | // The function looks for a sub-mask whose nonzero elements are in
8451 | // increasing order; if such a sub-mask exists, the function returns true.
8452 | static bool isNonZeroElementsInOrder(const APInt &Zeroable, |
8453 | ArrayRef<int> Mask, const EVT &VectorType, |
8454 | bool &IsZeroSideLeft) { |
8455 | int NextElement = -1; |
8456 | // Check if the Mask's nonzero elements are in increasing order. |
8457 | for (int i = 0, e = Mask.size(); i < e; i++) { |
8458 | // Out-of-bound mask elements are not allowed.
8459 | assert(Mask[i] >= -1 && "Out of bound mask element!");
8460 | if (Mask[i] < 0) |
8461 | return false; |
8462 | if (Zeroable[i]) |
8463 | continue; |
8464 | // Find the lowest non zero element |
8465 | if (NextElement < 0) { |
8466 | NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; |
8467 | IsZeroSideLeft = NextElement != 0; |
8468 | } |
8469 | // Exit if the mask's non zero elements are not in increasing order. |
8470 | if (NextElement != Mask[i]) |
8471 | return false; |
8472 | NextElement++; |
8473 | } |
8474 | return true; |
8475 | } |
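
// A worked example of the pattern this predicate accepts (hypothetical
// values): for a v4i32 shuffle of V1 against an all-zeros V2 with
// Mask = <0, 4, 1, 4> and lanes 1 and 3 zeroable, the nonzero entries 0, 1
// are consecutive and start at 0, so the function returns true with
// IsZeroSideLeft == false; VEXPAND can then place V1[0] and V1[1] into lanes
// 0 and 2 while zeroing lanes 1 and 3.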
8476 | |
8477 | /// Try to lower a shuffle with a single PSHUFB of V1 or V2. |
8478 | static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, |
8479 | ArrayRef<int> Mask, SDValue V1, |
8480 | SDValue V2, |
8481 | const APInt &Zeroable, |
8482 | const X86Subtarget &Subtarget, |
8483 | SelectionDAG &DAG) { |
8484 | int Size = Mask.size(); |
8485 | int LaneSize = 128 / VT.getScalarSizeInBits(); |
8486 | const int NumBytes = VT.getSizeInBits() / 8; |
8487 | const int NumEltBytes = VT.getScalarSizeInBits() / 8; |
8488 | |
8489 | assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8490 | (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8491 | (Subtarget.hasBWI() && VT.is512BitVector()));
8492 | |
8493 | SmallVector<SDValue, 64> PSHUFBMask(NumBytes); |
8494 | // Sign bit set in i8 mask means zero element. |
8495 | SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); |
8496 | |
8497 | SDValue V; |
8498 | for (int i = 0; i < NumBytes; ++i) { |
8499 | int M = Mask[i / NumEltBytes]; |
8500 | if (M < 0) { |
8501 | PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); |
8502 | continue; |
8503 | } |
8504 | if (Zeroable[i / NumEltBytes]) { |
8505 | PSHUFBMask[i] = ZeroMask; |
8506 | continue; |
8507 | } |
8508 | |
8509 | // We can only use a single input of V1 or V2. |
8510 | SDValue SrcV = (M >= Size ? V2 : V1); |
8511 | if (V && V != SrcV) |
8512 | return SDValue(); |
8513 | V = SrcV; |
8514 | M %= Size; |
8515 | |
8516 | // PSHUFB can't cross lanes, ensure this doesn't happen. |
8517 | if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) |
8518 | return SDValue(); |
8519 | |
8520 | M = M % LaneSize; |
8521 | M = M * NumEltBytes + (i % NumEltBytes); |
8522 | PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); |
8523 | } |
8524 | assert(V && "Failed to find a source input");
8525 | |
8526 | MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); |
8527 | return DAG.getBitcast( |
8528 | VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), |
8529 | DAG.getBuildVector(I8VT, DL, PSHUFBMask))); |
8530 | } |
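
// A worked byte-mask example (hypothetical values): for a v8i16 single-input
// shuffle with element mask <3, 3, zero, undef, 7, 7, zero, undef>, each
// 16-bit element expands to two byte selectors, each zeroable element to two
// 0x80 bytes (sign bit set), and each undef element to two undef bytes:
//   PSHUFB mask = {6,7, 6,7, 0x80,0x80, u,u, 14,15, 14,15, 0x80,0x80, u,u}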
8531 | |
8532 | static SDValue getMaskNode(SDValue Mask, MVT MaskVT, |
8533 | const X86Subtarget &Subtarget, SelectionDAG &DAG, |
8534 | const SDLoc &dl); |
8535 | |
8536 | // X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
8537 | static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, |
8538 | const APInt &Zeroable, |
8539 | ArrayRef<int> Mask, SDValue &V1, |
8540 | SDValue &V2, SelectionDAG &DAG, |
8541 | const X86Subtarget &Subtarget) { |
8542 | bool IsLeftZeroSide = true; |
8543 | if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), |
8544 | IsLeftZeroSide)) |
8545 | return SDValue(); |
8546 | unsigned VEXPANDMask = (~Zeroable).getZExtValue(); |
8547 | MVT IntegerType = |
8548 | MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); |
8549 | SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); |
8550 | unsigned NumElts = VT.getVectorNumElements(); |
8551 | assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8552 | "Unexpected number of vector elements");
8553 | SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), |
8554 | Subtarget, DAG, DL); |
8555 | SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); |
8556 | SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; |
8557 | return DAG.getSelect(DL, VT, VMask, |
8558 | DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), |
8559 | ZeroVector); |
8560 | } |
8561 | |
8562 | static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, |
8563 | unsigned &UnpackOpcode, bool IsUnary, |
8564 | ArrayRef<int> TargetMask, SDLoc &DL, |
8565 | SelectionDAG &DAG, |
8566 | const X86Subtarget &Subtarget) { |
8567 | int NumElts = VT.getVectorNumElements(); |
8568 | |
8569 | bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; |
8570 | for (int i = 0; i != NumElts; i += 2) { |
8571 | int M1 = TargetMask[i + 0]; |
8572 | int M2 = TargetMask[i + 1]; |
8573 | Undef1 &= (SM_SentinelUndef == M1); |
8574 | Undef2 &= (SM_SentinelUndef == M2); |
8575 | Zero1 &= isUndefOrZero(M1); |
8576 | Zero2 &= isUndefOrZero(M2); |
8577 | } |
8578 | assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8579 | "Zeroable shuffle detected");
8580 | |
8581 | // Attempt to match the target mask against the unpack lo/hi mask patterns. |
8582 | SmallVector<int, 64> Unpckl, Unpckh; |
8583 | createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); |
8584 | if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { |
8585 | UnpackOpcode = X86ISD::UNPCKL; |
8586 | V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); |
8587 | V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); |
8588 | return true; |
8589 | } |
8590 | |
8591 | createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); |
8592 | if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { |
8593 | UnpackOpcode = X86ISD::UNPCKH; |
8594 | V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); |
8595 | V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); |
8596 | return true; |
8597 | } |
8598 | |
8599 | // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8600 | if (IsUnary && (Zero1 || Zero2)) { |
8601 | // Don't bother if we can blend instead. |
8602 | if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && |
8603 | isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) |
8604 | return false; |
8605 | |
8606 | bool MatchLo = true, MatchHi = true; |
8607 | for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { |
8608 | int M = TargetMask[i]; |
8609 | |
8610 | // Ignore if the input is known to be zero or the index is undef. |
8611 | if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || |
8612 | (M == SM_SentinelUndef)) |
8613 | continue; |
8614 | |
8615 | MatchLo &= (M == Unpckl[i]); |
8616 | MatchHi &= (M == Unpckh[i]); |
8617 | } |
8618 | |
8619 | if (MatchLo || MatchHi) { |
8620 | UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; |
8621 | V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; |
8622 | V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; |
8623 | return true; |
8624 | } |
8625 | } |
8626 | |
8627 | // If a binary shuffle, commute and try again. |
8628 | if (!IsUnary) { |
8629 | ShuffleVectorSDNode::commuteMask(Unpckl); |
8630 | if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { |
8631 | UnpackOpcode = X86ISD::UNPCKL; |
8632 | std::swap(V1, V2); |
8633 | return true; |
8634 | } |
8635 | |
8636 | ShuffleVectorSDNode::commuteMask(Unpckh); |
8637 | if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { |
8638 | UnpackOpcode = X86ISD::UNPCKH; |
8639 | std::swap(V1, V2); |
8640 | return true; |
8641 | } |
8642 | } |
8643 | |
8644 | return false; |
8645 | } |
8646 | |
8647 | // X86 has dedicated unpack instructions that can handle specific blend |
8648 | // operations: UNPCKH and UNPCKL. |
8649 | static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, |
8650 | ArrayRef<int> Mask, SDValue V1, |
8651 | SDValue V2, SelectionDAG &DAG) { |
8652 | SmallVector<int, 8> Unpckl; |
8653 | createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); |
8654 | if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) |
8655 | return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); |
8656 | |
8657 | SmallVector<int, 8> Unpckh; |
8658 | createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); |
8659 | if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) |
8660 | return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); |
8661 | |
8662 | // Commute and try again. |
8663 | ShuffleVectorSDNode::commuteMask(Unpckl); |
8664 | if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) |
8665 | return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); |
8666 | |
8667 | ShuffleVectorSDNode::commuteMask(Unpckh); |
8668 | if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) |
8669 | return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); |
8670 | |
8671 | return SDValue(); |
8672 | } |
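
// For reference, the masks being matched in the v4i32 case (a sketch of the
// createUnpackShuffleMask output, assuming its usual interleave pattern):
//   UNPCKL interleaves the low halves:  <0, 4, 1, 5>
//   UNPCKH interleaves the high halves: <2, 6, 3, 7>
// After commuting, e.g. <4, 0, 5, 1> also lowers to UNPCKL with the operands
// swapped.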
8673 | |
8674 | /// \brief Try to emit a bitmask instruction for a shuffle. |
8675 | /// |
8676 | /// This handles cases where we can model a blend exactly as a bitmask due to |
8677 | /// one of the inputs being zeroable. |
8678 | static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, |
8679 | SDValue V2, ArrayRef<int> Mask, |
8680 | const APInt &Zeroable, |
8681 | SelectionDAG &DAG) { |
8682 | assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8683 | MVT EltVT = VT.getVectorElementType(); |
8684 | SDValue Zero = DAG.getConstant(0, DL, EltVT); |
8685 | SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); |
8686 | SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); |
8687 | SDValue V; |
8688 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
8689 | if (Zeroable[i]) |
8690 | continue; |
8691 | if (Mask[i] % Size != i) |
8692 | return SDValue(); // Not a blend. |
8693 | if (!V) |
8694 | V = Mask[i] < Size ? V1 : V2; |
8695 | else if (V != (Mask[i] < Size ? V1 : V2)) |
8696 | return SDValue(); // Can only let one input through the mask. |
8697 | |
8698 | VMaskOps[i] = AllOnes; |
8699 | } |
8700 | if (!V) |
8701 | return SDValue(); // No non-zeroable elements! |
8702 | |
8703 | SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); |
8704 | return DAG.getNode(ISD::AND, DL, VT, V, VMask); |
8705 | } |
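
// A worked example (hypothetical values): for a v4i32 shuffle with
// Mask = <0, 1, 2, 3> where lanes 1 and 3 are zeroable, every surviving lane
// passes V1 through in place, so the whole blend collapses to
//   and V1, <-1, 0, -1, 0>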
8706 | |
8707 | /// \brief Try to emit a blend instruction for a shuffle using bit math. |
8708 | /// |
8709 | /// This is used as a fallback approach when first class blend instructions are |
8710 | /// unavailable. Currently it is only suitable for integer vectors, but could |
8711 | /// be generalized for floating point vectors if desirable. |
8712 | static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, |
8713 | SDValue V2, ArrayRef<int> Mask, |
8714 | SelectionDAG &DAG) { |
8715 | assert(VT.isInteger() && "Only supports integer vector types!");
8716 | MVT EltVT = VT.getVectorElementType(); |
8717 | SDValue Zero = DAG.getConstant(0, DL, EltVT); |
8718 | SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); |
8719 | SmallVector<SDValue, 16> MaskOps; |
8720 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
8721 | if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) |
8722 | return SDValue(); // Shuffled input! |
8723 | MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); |
8724 | } |
8725 | |
8726 | SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); |
8727 | V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); |
8728 | // We have to cast V2 around. |
8729 | MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); |
8730 | V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, |
8731 | DAG.getBitcast(MaskVT, V1Mask), |
8732 | DAG.getBitcast(MaskVT, V2))); |
8733 | return DAG.getNode(ISD::OR, DL, VT, V1, V2); |
8734 | } |
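
// The identity implemented above, in scalar form: with M[i] == all-ones when
// lane i comes from V1,
//   Result = (V1 & M) | (V2 & ~M)
// X86ISD::ANDNP computes the (~M & V2) term directly, since PANDN negates
// its first operand.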
8735 | |
8736 | static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, |
8737 | SDValue PreservedSrc, |
8738 | const X86Subtarget &Subtarget, |
8739 | SelectionDAG &DAG); |
8740 | |
8741 | static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, |
8742 | MutableArrayRef<int> TargetMask, |
8743 | bool &ForceV1Zero, bool &ForceV2Zero, |
8744 | uint64_t &BlendMask) { |
8745 | bool V1IsZeroOrUndef = |
8746 | V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); |
8747 | bool V2IsZeroOrUndef = |
8748 | V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); |
8749 | |
8750 | BlendMask = 0; |
8751 | ForceV1Zero = false, ForceV2Zero = false; |
8752 | assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8753 | |
8754 | // Attempt to generate the binary blend mask. If an input is zero then |
8755 | // we can use any lane. |
8756 | // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. |
8757 | for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { |
8758 | int M = TargetMask[i]; |
8759 | if (M == SM_SentinelUndef) |
8760 | continue; |
8761 | if (M == i) |
8762 | continue; |
8763 | if (M == i + Size) { |
8764 | BlendMask |= 1ull << i; |
8765 | continue; |
8766 | } |
8767 | if (M == SM_SentinelZero) { |
8768 | if (V1IsZeroOrUndef) { |
8769 | ForceV1Zero = true; |
8770 | TargetMask[i] = i; |
8771 | continue; |
8772 | } |
8773 | if (V2IsZeroOrUndef) { |
8774 | ForceV2Zero = true; |
8775 | BlendMask |= 1ull << i; |
8776 | TargetMask[i] = i + Size; |
8777 | continue; |
8778 | } |
8779 | } |
8780 | return false; |
8781 | } |
8782 | return true; |
8783 | } |
8784 | |
8785 | uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { |
8786 | uint64_t ScaledMask = 0; |
8787 | for (int i = 0; i != Size; ++i) |
8788 | if (BlendMask & (1ull << i)) |
8789 | ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); |
8790 | return ScaledMask; |
8791 | } |
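
// A worked example (hypothetical values): widening a per-element blend mask
// to a finer granularity duplicates each bit into a run of Scale bits.
#if 0
  uint64_t Scaled = scaleVectorShuffleBlendMask(/*BlendMask=*/0x5,
                                                /*Size=*/4, /*Scale=*/2);
  // 0b0101 -> 0b00110011, i.e. Scaled == 0x33: a v4i64 blend mask rescaled
  // for use as a v8i32 (VPBLENDD) mask.
#endif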
8792 | |
8793 | /// \brief Try to emit a blend instruction for a shuffle. |
8794 | /// |
8795 | /// This doesn't do any checks for the availability of instructions for blending |
8796 | /// these values. It relies on the availability of the X86ISD::BLENDI pattern to |
8797 | /// be matched in the backend with the type given. What it does check for is |
8798 | /// that the shuffle mask is a blend, or convertible into a blend with zero. |
8799 | static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, |
8800 | SDValue V2, ArrayRef<int> Original, |
8801 | const APInt &Zeroable, |
8802 | const X86Subtarget &Subtarget, |
8803 | SelectionDAG &DAG) { |
8804 | SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); |
8805 | |
8806 | uint64_t BlendMask = 0; |
8807 | bool ForceV1Zero = false, ForceV2Zero = false; |
8808 | if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, |
8809 | BlendMask)) |
8810 | return SDValue(); |
8811 | |
8812 | // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. |
8813 | if (ForceV1Zero) |
8814 | V1 = getZeroVector(VT, Subtarget, DAG, DL); |
8815 | if (ForceV2Zero) |
8816 | V2 = getZeroVector(VT, Subtarget, DAG, DL); |
8817 | |
8818 | switch (VT.SimpleTy) { |
8819 | case MVT::v2f64: |
8820 | case MVT::v4f32: |
8821 | case MVT::v4f64: |
8822 | case MVT::v8f32: |
8823 | return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, |
8824 | DAG.getConstant(BlendMask, DL, MVT::i8)); |
8825 | |
8826 | case MVT::v4i64: |
8827 | case MVT::v8i32: |
8828 | assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8829 | LLVM_FALLTHROUGH;
8830 | case MVT::v2i64: |
8831 | case MVT::v4i32: |
8832 | // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into |
8833 | // that instruction. |
8834 | if (Subtarget.hasAVX2()) { |
8835 | // Scale the blend by the number of 32-bit dwords per element. |
8836 | int Scale = VT.getScalarSizeInBits() / 32; |
8837 | BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); |
8838 | MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; |
8839 | V1 = DAG.getBitcast(BlendVT, V1); |
8840 | V2 = DAG.getBitcast(BlendVT, V2); |
8841 | return DAG.getBitcast( |
8842 | VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, |
8843 | DAG.getConstant(BlendMask, DL, MVT::i8))); |
8844 | } |
8845 | LLVM_FALLTHROUGH;
8846 | case MVT::v8i16: { |
8847 | // For integer shuffles we need to expand the mask and cast the inputs to |
8848 | // v8i16s prior to blending. |
8849 | int Scale = 8 / VT.getVectorNumElements(); |
8850 | BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); |
8851 | V1 = DAG.getBitcast(MVT::v8i16, V1); |
8852 | V2 = DAG.getBitcast(MVT::v8i16, V2); |
8853 | return DAG.getBitcast(VT, |
8854 | DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, |
8855 | DAG.getConstant(BlendMask, DL, MVT::i8))); |
8856 | } |
8857 | |
8858 | case MVT::v16i16: { |
8859 | assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8860 | SmallVector<int, 8> RepeatedMask; |
8861 | if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { |
8862 | // We can lower these with PBLENDW which is mirrored across 128-bit lanes. |
8863 | assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8864 | BlendMask = 0; |
8865 | for (int i = 0; i < 8; ++i) |
8866 | if (RepeatedMask[i] >= 8) |
8867 | BlendMask |= 1ull << i; |
8868 | return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, |
8869 | DAG.getConstant(BlendMask, DL, MVT::i8)); |
8870 | } |
8871 | LLVM_FALLTHROUGH;
8872 | } |
8873 | case MVT::v16i8: |
8874 | case MVT::v32i8: { |
8875 | assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8876 |        "256-bit byte-blends require AVX2 support!");
8877 | |
8878 | if (Subtarget.hasBWI() && Subtarget.hasVLX()) { |
8879 | MVT IntegerType = |
8880 | MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); |
8881 | SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); |
8882 | return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); |
8883 | } |
8884 | |
8885 | // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. |
8886 | if (SDValue Masked = |
8887 | lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) |
8888 | return Masked; |
8889 | |
8890 | // Scale the blend by the number of bytes per element. |
8891 | int Scale = VT.getScalarSizeInBits() / 8; |
8892 | |
8893 | // This form of blend is always done on bytes. Compute the byte vector |
8894 | // type. |
8895 | MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); |
8896 | |
8897 | // Compute the VSELECT mask. Note that VSELECT is really confusing in the |
8898 | // mix of LLVM's code generator and the x86 backend. We tell the code |
8899 | // generator that boolean values in the elements of an x86 vector register |
8900 | // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' |
8901 | // mapping a select to operand #1, and 'false' mapping to operand #2. The |
8902 | // reality in x86 is that vector masks (pre-AVX-512) use only the high bit |
8903 | // of the element (the remaining are ignored) and 0 in that high bit would |
8904 | // mean operand #1 while 1 in the high bit would mean operand #2. So while |
8905 | // the LLVM model for boolean values in vector elements gets the relevant |
8906 | // bit set, it is set backwards and over-constrained relative to x86's
8907 | // actual model. |
8908 | SmallVector<SDValue, 32> VSELECTMask; |
8909 | for (int i = 0, Size = Mask.size(); i < Size; ++i) |
8910 | for (int j = 0; j < Scale; ++j) |
8911 | VSELECTMask.push_back( |
8912 | Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) |
8913 | : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, |
8914 | MVT::i8)); |
8915 | |
8916 | V1 = DAG.getBitcast(BlendVT, V1); |
8917 | V2 = DAG.getBitcast(BlendVT, V2); |
8918 | return DAG.getBitcast( |
8919 | VT, |
8920 | DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), |
8921 | V1, V2)); |
8922 | } |
8923 | case MVT::v16f32: |
8924 | case MVT::v8f64: |
8925 | case MVT::v8i64: |
8926 | case MVT::v16i32: |
8927 | case MVT::v32i16: |
8928 | case MVT::v64i8: { |
8929 | MVT IntegerType = |
8930 | MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); |
8931 | SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); |
8932 | return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); |
8933 | } |
8934 | default: |
8935 | llvm_unreachable("Not a supported integer vector type!");
8936 | } |
8937 | } |
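     | // For illustration, one sample v4i32 path through the lowering above:
     | // the shuffle mask [0, 5, 2, 7] takes elements 1 and 3 from V2, so
     | // matchVectorShuffleAsBlend produces BlendMask = 0b1010. With AVX2 the
     | // v4i32 case keeps Scale = 32/32 = 1 and emits
     | //   X86ISD::BLENDI v4i32, V1, V2, imm 0b1010
     | // which roughly corresponds to VPBLENDD $0xA.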
8938 | |
8939 | /// \brief Try to lower as a blend of elements from two inputs followed by |
8940 | /// a single-input permutation. |
8941 | /// |
8942 | /// This matches the pattern where we can blend elements from two inputs and |
8943 | /// then reduce the shuffle to a single-input permutation. |
8944 | static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, |
8945 | SDValue V1, SDValue V2, |
8946 | ArrayRef<int> Mask, |
8947 | SelectionDAG &DAG) { |
8948 | // We build up the blend mask while checking whether a blend is a viable way |
8949 | // to reduce the shuffle. |
8950 | SmallVector<int, 32> BlendMask(Mask.size(), -1); |
8951 | SmallVector<int, 32> PermuteMask(Mask.size(), -1); |
8952 | |
8953 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { |
8954 | if (Mask[i] < 0) |
8955 | continue; |
8956 | |
8957 | assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8958 | |
8959 | if (BlendMask[Mask[i] % Size] < 0) |
8960 | BlendMask[Mask[i] % Size] = Mask[i]; |
8961 | else if (BlendMask[Mask[i] % Size] != Mask[i]) |
8962 | return SDValue(); // Can't blend in the needed input! |
8963 | |
8964 | PermuteMask[i] = Mask[i] % Size; |
8965 | } |
8966 | |
8967 | SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); |
8968 | return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); |
8969 | } |
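     | // For illustration: Mask = [1, 4, 3, 6] (Size = 4) is not itself a blend,
     | // but the loop above derives BlendMask = [4, 1, 6, 3] (a valid per-lane
     | // blend) and PermuteMask = [1, 0, 3, 2], so the shuffle becomes one blend
     | // followed by one single-input permute.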
8970 | |
8971 | /// \brief Generic routine to decompose a shuffle and blend into independent |
8972 | /// blends and permutes. |
8973 | /// |
8974 | /// This matches the extremely common pattern for handling combined |
8975 | /// shuffle+blend operations on newer X86 ISAs where we have very fast blend |
8976 | /// operations. It will try to pick the best arrangement of shuffles and |
8977 | /// blends. |
8978 | static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, |
8979 | MVT VT, SDValue V1, |
8980 | SDValue V2, |
8981 | ArrayRef<int> Mask, |
8982 | SelectionDAG &DAG) { |
8983 | // Shuffle the input elements into the desired positions in V1 and V2 and |
8984 | // blend them together. |
8985 | SmallVector<int, 32> V1Mask(Mask.size(), -1); |
8986 | SmallVector<int, 32> V2Mask(Mask.size(), -1); |
8987 | SmallVector<int, 32> BlendMask(Mask.size(), -1); |
8988 | for (int i = 0, Size = Mask.size(); i < Size; ++i) |
8989 | if (Mask[i] >= 0 && Mask[i] < Size) { |
8990 | V1Mask[i] = Mask[i]; |
8991 | BlendMask[i] = i; |
8992 | } else if (Mask[i] >= Size) { |
8993 | V2Mask[i] = Mask[i] - Size; |
8994 | BlendMask[i] = i + Size; |
8995 | } |
8996 | |
8997 | // Try to lower with the simpler initial blend strategy unless one of the |
8998 | // input shuffles would be a no-op. We prefer to shuffle inputs as the |
8999 | // shuffle may be able to fold with a load or other benefit. However, when |
9000 | // we'll have to do 2x as many shuffles in order to achieve this, blending |
9001 | // first is a better strategy. |
9002 | if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) |
9003 | if (SDValue BlendPerm = |
9004 | lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) |
9005 | return BlendPerm; |
9006 | |
9007 | V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); |
9008 | V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); |
9009 | return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); |
9010 | } |
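     | // For illustration: Mask = [2, 6, 3, 7] (Size = 4) decomposes into
     | // V1Mask = [2, -1, 3, -1], V2Mask = [-1, 2, -1, 3] and
     | // BlendMask = [0, 5, 2, 7]: shuffle each input independently, then blend
     | // the results lane by lane.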
9011 | |
9012 | /// \brief Try to lower a vector shuffle as a rotation. |
9013 | /// |
9014 | /// This is used to support PALIGNR on SSSE3 and VALIGND/Q on AVX512.
9015 | static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, |
9016 | ArrayRef<int> Mask) { |
9017 | int NumElts = Mask.size(); |
9018 | |
9019 | // We need to detect various ways of spelling a rotation: |
9020 | // [11, 12, 13, 14, 15, 0, 1, 2] |
9021 | // [-1, 12, 13, 14, -1, -1, 1, -1] |
9022 | // [-1, -1, -1, -1, -1, -1, 1, 2] |
9023 | // [ 3, 4, 5, 6, 7, 8, 9, 10] |
9024 | // [-1, 4, 5, 6, -1, -1, 9, -1] |
9025 | // [-1, 4, 5, 6, -1, -1, -1, -1] |
9026 | int Rotation = 0; |
9027 | SDValue Lo, Hi; |
9028 | for (int i = 0; i < NumElts; ++i) { |
9029 | int M = Mask[i]; |
9030 | assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9031 |        "Unexpected mask index.");
9032 | if (M < 0) |
9033 | continue; |
9034 | |
9035 | // Determine where a rotated vector would have started. |
9036 | int StartIdx = i - (M % NumElts); |
9037 | if (StartIdx == 0) |
9038 | // The identity rotation isn't interesting, stop. |
9039 | return -1; |
9040 | |
9041 | // If we found the tail of a vector, the rotation must be the size of the
9042 | // missing front. If we found the head of a vector, the rotation must be
9043 | // how much of the head is still visible.
9044 | int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; |
9045 | |
9046 | if (Rotation == 0) |
9047 | Rotation = CandidateRotation; |
9048 | else if (Rotation != CandidateRotation) |
9049 | // The rotations don't match, so we can't match this mask. |
9050 | return -1; |
9051 | |
9052 | // Compute which value this mask is pointing at. |
9053 | SDValue MaskV = M < NumElts ? V1 : V2; |
9054 | |
9055 | // Compute which of the two target values this index should be assigned |
9056 | // to. This reflects whether the high elements are remaining or the low |
9057 | // elements are remaining. |
9058 | SDValue &TargetV = StartIdx < 0 ? Hi : Lo; |
9059 | |
9060 | // Either set up this value if we've not encountered it before, or check |
9061 | // that it remains consistent. |
9062 | if (!TargetV) |
9063 | TargetV = MaskV; |
9064 | else if (TargetV != MaskV) |
9065 | // This may be a rotation, but it pulls from the inputs in some |
9066 | // unsupported interleaving. |
9067 | return -1; |
9068 | } |
9069 | |
9070 | // Check that we successfully analyzed the mask, and normalize the results. |
9071 | assert(Rotation != 0 && "Failed to locate a viable rotation!");
9072 | assert((Lo || Hi) && "Failed to find a rotated input vector!");
9073 | if (!Lo) |
9074 | Lo = Hi; |
9075 | else if (!Hi) |
9076 | Hi = Lo; |
9077 | |
9078 | V1 = Lo; |
9079 | V2 = Hi; |
9080 | |
9081 | return Rotation; |
9082 | } |
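     | // For illustration, tracing the loop above on the first example mask,
     | // [11, 12, 13, 14, 15, 0, 1, 2] with NumElts = 8: element 0 has M = 11,
     | // so StartIdx = 0 - (11 % 8) = -3 and CandidateRotation = 3 (tail, from
     | // V2); element 5 has M = 0, so StartIdx = 5 and CandidateRotation =
     | // 8 - 5 = 3 (head, from V1). Every element agrees, so the match returns
     | // a rotation of 3 with Lo = V1 and Hi = V2.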
9083 | |
9084 | /// \brief Try to lower a vector shuffle as a byte rotation. |
9085 | /// |
9086 | /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary |
9087 | /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use |
9088 | /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will |
9089 | /// try to generically lower a vector shuffle through such a pattern. It
9090 | /// does not check for the profitability of lowering either as PALIGNR or |
9091 | /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. |
9092 | /// This matches shuffle vectors that look like: |
9093 | /// |
9094 | /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] |
9095 | /// |
9096 | /// Essentially it concatenates V1 and V2, shifts right by some number of |
9097 | /// elements, and takes the low elements as the result. Note that while this is |
9098 | /// specified as a *right shift* because x86 is little-endian, it is a *left |
9099 | /// rotate* of the vector lanes. |
9100 | static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, |
9101 | ArrayRef<int> Mask) { |
9102 | // Don't accept any shuffles with zero elements. |
9103 | if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) |
9104 | return -1; |
9105 | |
9106 | // PALIGNR works on 128-bit lanes. |
9107 | SmallVector<int, 16> RepeatedMask; |
9108 | if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) |
9109 | return -1; |
9110 | |
9111 | int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); |
9112 | if (Rotation <= 0) |
9113 | return -1; |
9114 | |
9115 | // PALIGNR rotates bytes, so we need to scale the |
9116 | // rotation based on how many bytes are in the vector lane. |
9117 | int NumElts = RepeatedMask.size(); |
9118 | int Scale = 16 / NumElts; |
9119 | return Rotation * Scale; |
9120 | } |
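     | // For illustration: a v8i16 shuffle that matches an element rotation of
     | // 3 scales to a byte rotation of 3 * (16 / 8) = 6, since PALIGNR counts
     | // bytes within each 128-bit lane.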
9121 | |
9122 | static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, |
9123 | SDValue V1, SDValue V2, |
9124 | ArrayRef<int> Mask, |
9125 | const X86Subtarget &Subtarget, |
9126 | SelectionDAG &DAG) { |
9127 | assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9128 | |
9129 | SDValue Lo = V1, Hi = V2; |
9130 | int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); |
9131 | if (ByteRotation <= 0) |
9132 | return SDValue(); |
9133 | |
9134 | // Cast the inputs to an i8 vector of the correct length to match PALIGNR
9135 | // or PSLLDQ/PSRLDQ.
9136 | MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); |
9137 | Lo = DAG.getBitcast(ByteVT, Lo); |
9138 | Hi = DAG.getBitcast(ByteVT, Hi); |
9139 | |
9140 | // SSSE3 targets can use the palignr instruction. |
9141 | if (Subtarget.hasSSSE3()) { |
9142 | assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9143 |        "512-bit PALIGNR requires BWI instructions");
9144 | return DAG.getBitcast( |
9145 | VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, |
9146 | DAG.getConstant(ByteRotation, DL, MVT::i8))); |
9147 | } |
9148 | |
9149 | assert(VT.is128BitVector() &&
9150 |        "Rotate-based lowering only supports 128-bit lowering!");
9151 | assert(Mask.size() <= 16 &&
9152 |        "Can shuffle at most 16 bytes in a 128-bit vector!");
9153 | assert(ByteVT == MVT::v16i8 &&
9154 |        "SSE2 rotate lowering only needed for v16i8!");
9155 | |
9156 | // Default SSE2 implementation |
9157 | int LoByteShift = 16 - ByteRotation; |
9158 | int HiByteShift = ByteRotation; |
9159 | |
9160 | SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, |
9161 | DAG.getConstant(LoByteShift, DL, MVT::i8)); |
9162 | SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, |
9163 | DAG.getConstant(HiByteShift, DL, MVT::i8)); |
9164 | return DAG.getBitcast(VT, |
9165 | DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); |
9166 | } |
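     | // For illustration, the pre-SSSE3 fallback above with ByteRotation = 5
     | // emits (in rough assembly):
     | //   pslldq $11, %xmm_lo     ; LoByteShift = 16 - 5
     | //   psrldq $5,  %xmm_hi     ; HiByteShift = 5
     | //   por    %xmm_hi, %xmm_lo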
9167 | |
9168 | /// \brief Try to lower a vector shuffle as a dword/qword rotation. |
9169 | /// |
9170 | /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9171 | /// rotation of the concatenation of two vectors; this routine will
9172 | /// try to generically lower a vector shuffle through such a pattern.
9173 | /// |
9174 | /// Essentially it concatenates V1 and V2, shifts right by some number of |
9175 | /// elements, and takes the low elements as the result. Note that while this is |
9176 | /// specified as a *right shift* because x86 is little-endian, it is a *left |
9177 | /// rotate* of the vector lanes. |
9178 | static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, |
9179 | SDValue V1, SDValue V2, |
9180 | ArrayRef<int> Mask, |
9181 | const X86Subtarget &Subtarget, |
9182 | SelectionDAG &DAG) { |
9183 | assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9184 |        "Only 32-bit and 64-bit elements are supported!");
9185 | |
9186 | // 128/256-bit vectors are only supported with VLX. |
9187 | assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9188 |        && "VLX required for 128/256-bit vectors");
9189 | |
9190 | SDValue Lo = V1, Hi = V2; |
9191 | int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); |
9192 | if (Rotation <= 0) |
9193 | return SDValue(); |
9194 | |
9195 | return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, |
9196 | DAG.getConstant(Rotation, DL, MVT::i8)); |
9197 | } |
9198 | |
9199 | /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). |
9200 | /// |
9201 | /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and |
9202 | /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function |
9203 | /// matches elements from one of the input vectors shuffled to the left or |
9204 | /// right with zeroable elements 'shifted in'. It handles both the strictly |
9205 | /// bit-wise element shifts and the byte shift across an entire 128-bit double |
9206 | /// quad word lane. |
9207 | /// |
9208 | /// PSLL : (little-endian) left bit shift.
9209 | /// [ zz, 0, zz, 2 ] |
9210 | /// [ -1, 4, zz, -1 ] |
9211 | /// PSRL : (little-endian) right bit shift. |
9212 | /// [ 1, zz, 3, zz] |
9213 | /// [ -1, -1, 7, zz] |
9214 | /// PSLLDQ : (little-endian) left byte shift |
9215 | /// [ zz, 0, 1, 2, 3, 4, 5, 6] |
9216 | /// [ zz, zz, -1, -1, 2, 3, 4, -1] |
9217 | /// [ zz, zz, zz, zz, zz, zz, -1, 1] |
9218 | /// PSRLDQ : (little-endian) right byte shift |
9219 | /// [ 5, 6, 7, zz, zz, zz, zz, zz] |
9220 | /// [ -1, 5, 6, 7, zz, zz, zz, zz] |
9221 | /// [ 1, 2, -1, -1, -1, -1, zz, zz] |
9222 | static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, |
9223 | unsigned ScalarSizeInBits, |
9224 | ArrayRef<int> Mask, int MaskOffset, |
9225 | const APInt &Zeroable, |
9226 | const X86Subtarget &Subtarget) { |
9227 | int Size = Mask.size(); |
9228 | unsigned SizeInBits = Size * ScalarSizeInBits; |
9229 | |
9230 | auto CheckZeros = [&](int Shift, int Scale, bool Left) { |
9231 | for (int i = 0; i < Size; i += Scale) |
9232 | for (int j = 0; j < Shift; ++j) |
9233 | if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) |
9234 | return false; |
9235 | |
9236 | return true; |
9237 | }; |
9238 | |
9239 | auto MatchShift = [&](int Shift, int Scale, bool Left) { |
9240 | for (int i = 0; i != Size; i += Scale) { |
9241 | unsigned Pos = Left ? i + Shift : i; |
9242 | unsigned Low = Left ? i : i + Shift; |
9243 | unsigned Len = Scale - Shift; |
9244 | if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) |
9245 | return -1; |
9246 | } |
9247 | |
9248 | int ShiftEltBits = ScalarSizeInBits * Scale; |
9249 | bool ByteShift = ShiftEltBits > 64; |
9250 | Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) |
9251 | : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); |
9252 | int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); |
9253 | |
9254 | // Normalize the scale for byte shifts to still produce an i64 element |
9255 | // type. |
9256 | Scale = ByteShift ? Scale / 2 : Scale; |
9257 | |
9258 | // We need to round trip through the appropriate type for the shift. |
9259 | MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); |
9260 | ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) |
9261 | : MVT::getVectorVT(ShiftSVT, Size / Scale); |
9262 | return (int)ShiftAmt; |
9263 | }; |
9264 | |
9265 | // SSE/AVX supports logical shifts up to 64-bit integers - so we can just |
9266 | // keep doubling the size of the integer elements up to that. We can |
9267 | // then shift the elements of the integer vector by whole multiples of |
9268 | // their width within the elements of the larger integer vector. Test each |
9269 | // multiple to see if we can find a match with the moved element indices |
9270 | // and that the shifted in elements are all zeroable. |
9271 | unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); |
9272 | for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) |
9273 | for (int Shift = 1; Shift != Scale; ++Shift) |
9274 | for (bool Left : {true, false}) |
9275 | if (CheckZeros(Shift, Scale, Left)) { |
9276 | int ShiftAmt = MatchShift(Shift, Scale, Left); |
9277 | if (0 < ShiftAmt) |
9278 | return ShiftAmt; |
9279 | } |
9280 | |
9281 | // no match |
9282 | return -1; |
9283 | } |
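     | // For illustration, matching the PSRL example [1, zz, 3, zz] on v4i32
     | // (Zeroable covers elements 1 and 3): at Scale = 2, Shift = 1 the
     | // shifted-in positions are zeroable and Mask[0] == 1, Mask[2] == 3 are
     | // sequential, so MatchShift picks X86ISD::VSRLI with ShiftVT = v2i64 and
     | // ShiftAmt = 1 * 32 = 32, i.e. a 32-bit right shift of each i64 element.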
9284 | |
9285 | static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, |
9286 | SDValue V2, ArrayRef<int> Mask, |
9287 | const APInt &Zeroable, |
9288 | const X86Subtarget &Subtarget, |
9289 | SelectionDAG &DAG) { |
9290 | int Size = Mask.size(); |
9291 | assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9292 | |
9293 | MVT ShiftVT; |
9294 | SDValue V = V1; |
9295 | unsigned Opcode; |
9296 | |
9297 | // Try to match shuffle against V1 shift. |
9298 | int ShiftAmt = matchVectorShuffleAsShift( |
9299 | ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); |
9300 | |
9301 | // If V1 failed, try to match shuffle against V2 shift. |
9302 | if (ShiftAmt < 0) { |
9303 | ShiftAmt = |
9304 | matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), |
9305 | Mask, Size, Zeroable, Subtarget); |
9306 | V = V2; |
9307 | } |
9308 | |
9309 | if (ShiftAmt < 0) |
9310 | return SDValue(); |
9311 | |
9312 | assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9313 |        "Illegal integer vector type");
9314 | V = DAG.getBitcast(ShiftVT, V); |
9315 | V = DAG.getNode(Opcode, DL, ShiftVT, V, |
9316 | DAG.getConstant(ShiftAmt, DL, MVT::i8)); |
9317 | return DAG.getBitcast(VT, V); |
9318 | } |
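     | // For illustration: with Mask = [5, zz, 7, zz] on v4i32, the V1 attempt
     | // (MaskOffset = 0) fails, but the V2 attempt (MaskOffset = Size = 4)
     | // matches Mask[0] == 1 + 4 and Mask[2] == 3 + 4, so the same VSRLI by 32
     | // is emitted with V = V2.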
9319 | |
9320 | /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. |
9321 | static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, |
9322 | SDValue V2, ArrayRef<int> Mask, |
9323 | const APInt &Zeroable, |
9324 | SelectionDAG &DAG) { |
9325 | int Size = Mask.size(); |
9326 | int HalfSize = Size / 2; |
9327 | assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9328 | assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9329 | |
9330 | // Upper half must be undefined. |
9331 | if (!isUndefInRange(Mask, HalfSize, HalfSize)) |
9332 | return SDValue(); |
9333 | |
9334 | // EXTRQ: Extract Len elements from lower half of source, starting at Idx. |
9335 | // Remainder of lower half result is zero and upper half is all undef. |
9336 | auto LowerAsEXTRQ = [&]() { |
9337 | // Determine the extraction length from the part of the |
9338 | // lower half that isn't zeroable. |
9339 | int Len = HalfSize; |
9340 | for (; Len > 0; --Len) |
9341 | if (!Zeroable[Len - 1]) |
9342 | break; |
9343 | assert(Len > 0 && "Zeroable shuffle mask");
9344 | |
9345 | // Attempt to match first Len sequential elements from the lower half. |
9346 | SDValue Src; |
9347 | int Idx = -1; |
9348 | for (int i = 0; i != Len; ++i) { |
9349 | int M = Mask[i]; |
9350 | if (M < 0) |
9351 | continue; |
9352 | SDValue &V = (M < Size ? V1 : V2); |
9353 | M = M % Size; |
9354 | |
9355 | // The extracted elements must start at a valid index and all mask |
9356 | // elements must be in the lower half. |
9357 | if (i > M || M >= HalfSize) |
9358 | return SDValue(); |
9359 | |
9360 | if (Idx < 0 || (Src == V && Idx == (M - i))) { |
9361 | Src = V; |
9362 | Idx = M - i; |
9363 | continue; |
9364 | } |
9365 | return SDValue(); |
9366 | } |
9367 | |
9368 | if (Idx < 0) |
9369 | return SDValue(); |
9370 | |
9371 | assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9372 | int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; |
9373 | int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; |
9374 | return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, |
9375 | DAG.getConstant(BitLen, DL, MVT::i8), |
9376 | DAG.getConstant(BitIdx, DL, MVT::i8)); |
9377 | }; |
9378 | |
9379 | if (SDValue ExtrQ = LowerAsEXTRQ()) |
9380 | return ExtrQ; |
9381 | |
9382 | // INSERTQ: Extract lowest Len elements from lower half of second source and |
9383 | // insert over first source, starting at Idx. |
9384 | // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } |
9385 | auto LowerAsInsertQ = [&]() { |
9386 | for (int Idx = 0; Idx != HalfSize; ++Idx) { |
9387 | SDValue Base; |
9388 | |
9389 | // Attempt to match first source from mask before insertion point. |
9390 | if (isUndefInRange(Mask, 0, Idx)) { |
9391 | /* EMPTY */ |
9392 | } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { |
9393 | Base = V1; |
9394 | } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { |
9395 | Base = V2; |
9396 | } else { |
9397 | continue; |
9398 | } |
9399 | |
9400 | // Extend the extraction length looking to match both the insertion of |
9401 | // the second source and the remaining elements of the first. |
9402 | for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { |
9403 | SDValue Insert; |
9404 | int Len = Hi - Idx; |
9405 | |
9406 | // Match insertion. |
9407 | if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { |
9408 | Insert = V1; |
9409 | } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { |
9410 | Insert = V2; |
9411 | } else { |
9412 | continue; |
9413 | } |
9414 | |
9415 | // Match the remaining elements of the lower half. |
9416 | if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { |
9417 | /* EMPTY */ |
9418 | } else if ((!Base || (Base == V1)) && |
9419 | isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { |
9420 | Base = V1; |
9421 | } else if ((!Base || (Base == V2)) && |
9422 | isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, |
9423 | Size + Hi)) { |
9424 | Base = V2; |
9425 | } else { |
9426 | continue; |
9427 | } |
9428 | |
9429 | // We may not have a base (first source) - this can safely be undefined. |
9430 | if (!Base) |
9431 | Base = DAG.getUNDEF(VT); |
9432 | |
9433 | int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; |
9434 | int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; |
9435 | return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, |
9436 | DAG.getConstant(BitLen, DL, MVT::i8), |
9437 | DAG.getConstant(BitIdx, DL, MVT::i8)); |
9438 | } |
9439 | } |
9440 | |
9441 | return SDValue(); |
9442 | }; |
9443 | |
9444 | if (SDValue InsertQ = LowerAsInsertQ()) |
9445 | return InsertQ; |
9446 | |
9447 | return SDValue(); |
9448 | } |
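     | // For illustration of the immediate encoding above: extracting Len = 3
     | // consecutive v8i16 elements starting at Idx = 2 gives
     | //   BitLen = (3 * 16) & 0x3f = 48 and BitIdx = (2 * 16) & 0x3f = 32,
     | // i.e. EXTRQ pulls 48 bits starting at bit 32 of the source.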
9449 | |
9450 | /// \brief Lower a vector shuffle as a zero or any extension. |
9451 | /// |
9452 | /// Given a specific number of elements, element bit width, and extension |
9453 | /// stride, produce either a zero or any extension based on the available |
9454 | /// features of the subtarget. The extended elements are consecutive and
9455 | /// can start from an offset element index in the input; to avoid excess
9456 | /// shuffling, the offset must either be in the bottom lane or at the
9457 | /// start of a higher lane. All extended elements must be from
9458 | /// the same lane. |
9459 | static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( |
9460 | const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, |
9461 | ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
9462 | assert(Scale > 1 && "Need a scale to extend.");
9463 | int EltBits = VT.getScalarSizeInBits(); |
9464 | int NumElements = VT.getVectorNumElements(); |
9465 | int NumEltsPerLane = 128 / EltBits; |
9466 | int OffsetLane = Offset / NumEltsPerLane; |
9467 | assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9468 |        "Only 8, 16, and 32 bit elements can be extended.");
9469 | assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9470 | assert(0 <= Offset && "Extension offset must be positive.");
9471 | assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9472 |        "Extension offset must be in the first lane or start an upper lane.");
9473 | |
9474 | // Check that an index is in same lane as the base offset. |
9475 | auto SafeOffset = [&](int Idx) { |
9476 | return OffsetLane == (Idx / NumEltsPerLane); |
9477 | }; |
9478 | |
9479 | // Shift along an input so that the offset base moves to the first element. |
9480 | auto ShuffleOffset = [&](SDValue V) { |
9481 | if (!Offset) |
9482 | return V; |
9483 | |
9484 | SmallVector<int, 8> ShMask((unsigned)NumElements, -1); |
9485 | for (int i = 0; i * Scale < NumElements; ++i) { |
9486 | int SrcIdx = i + Offset; |
9487 | ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; |
9488 | } |
9489 | return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); |
9490 | }; |
9491 | |
9492 | // Found a valid zext mask! Try various lowering strategies based on the |
9493 | // input type and available ISA extensions. |
9494 | if (Subtarget.hasSSE41()) { |
9495 | // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
9496 | // PUNPCK will catch this in a later shuffle match. |
9497 | if (Offset && Scale == 2 && VT.is128BitVector()) |
9498 | return SDValue(); |
9499 | MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), |
9500 | NumElements / Scale); |
9501 | InputV = ShuffleOffset(InputV); |
9502 | InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); |
9503 | return DAG.getBitcast(VT, InputV); |
9504 | } |
9505 | |
9506 | assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9507 | |
9508 | // For any extends we can cheat for larger element sizes and use shuffle |
9509 | // instructions that can fold with a load and/or copy. |
9510 | if (AnyExt && EltBits == 32) { |
9511 | int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, |
9512 | -1}; |
9513 | return DAG.getBitcast( |
9514 | VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, |
9515 | DAG.getBitcast(MVT::v4i32, InputV), |
9516 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); |
9517 | } |
9518 | if (AnyExt && EltBits == 16 && Scale > 2) { |
9519 | int PSHUFDMask[4] = {Offset / 2, -1, |
9520 | SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; |
9521 | InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, |
9522 | DAG.getBitcast(MVT::v4i32, InputV), |
9523 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); |
9524 | int PSHUFWMask[4] = {1, -1, -1, -1}; |
9525 | unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); |
9526 | return DAG.getBitcast( |
9527 | VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, |
9528 | DAG.getBitcast(MVT::v8i16, InputV), |
9529 | getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); |
9530 | } |
9531 | |
9532 | // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes |
9533 | // to 64-bits. |
9534 | if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { |
9535 | assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9536 | assert(VT.is128BitVector() && "Unexpected vector width!");
9537 | |
9538 | int LoIdx = Offset * EltBits; |
9539 | SDValue Lo = DAG.getBitcast( |
9540 | MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, |
9541 | DAG.getConstant(EltBits, DL, MVT::i8), |
9542 | DAG.getConstant(LoIdx, DL, MVT::i8))); |
9543 | |
9544 | if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || |
9545 | !SafeOffset(Offset + 1)) |
9546 | return DAG.getBitcast(VT, Lo); |
9547 | |
9548 | int HiIdx = (Offset + 1) * EltBits; |
9549 | SDValue Hi = DAG.getBitcast( |
9550 | MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, |
9551 | DAG.getConstant(EltBits, DL, MVT::i8), |
9552 | DAG.getConstant(HiIdx, DL, MVT::i8))); |
9553 | return DAG.getBitcast(VT, |
9554 | DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); |
9555 | } |
9556 | |
9557 | // If this would require more than 2 unpack instructions to expand, use |
9558 | // pshufb when available. We can only use more than 2 unpack instructions |
9559 | // when zero extending i8 elements which also makes it easier to use pshufb. |
9560 | if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { |
9561 | assert(NumElements == 16 && "Unexpected byte vector width!");
9562 | SDValue PSHUFBMask[16]; |
9563 | for (int i = 0; i < 16; ++i) { |
9564 | int Idx = Offset + (i / Scale); |
9565 | PSHUFBMask[i] = DAG.getConstant( |
9566 | (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); |
9567 | } |
9568 | InputV = DAG.getBitcast(MVT::v16i8, InputV); |
9569 | return DAG.getBitcast( |
9570 | VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, |
9571 | DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); |
9572 | } |
9573 | |
9574 | // If we are extending from an offset, ensure we start on a boundary that |
9575 | // we can unpack from. |
9576 | int AlignToUnpack = Offset % (NumElements / Scale); |
9577 | if (AlignToUnpack) { |
9578 | SmallVector<int, 8> ShMask((unsigned)NumElements, -1); |
9579 | for (int i = AlignToUnpack; i < NumElements; ++i) |
9580 | ShMask[i - AlignToUnpack] = i; |
9581 | InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); |
9582 | Offset -= AlignToUnpack; |
9583 | } |
9584 | |
9585 | // Otherwise emit a sequence of unpacks. |
9586 | do { |
9587 | unsigned UnpackLoHi = X86ISD::UNPCKL; |
9588 | if (Offset >= (NumElements / 2)) { |
9589 | UnpackLoHi = X86ISD::UNPCKH; |
9590 | Offset -= (NumElements / 2); |
9591 | } |
9592 | |
9593 | MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); |
9594 | SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) |
9595 | : getZeroVector(InputVT, Subtarget, DAG, DL); |
9596 | InputV = DAG.getBitcast(InputVT, InputV); |
9597 | InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); |
9598 | Scale /= 2; |
9599 | EltBits *= 2; |
9600 | NumElements /= 2; |
9601 | } while (Scale > 1); |
9602 | return DAG.getBitcast(VT, InputV); |
9603 | } |
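     | // For illustration, on a pre-SSE4.1 target the unpack loop above
     | // zero-extends v16i8 to v4i32 (Scale = 4, Offset = 0) in two rounds:
     | //   punpcklbw zero  -> i8  elements become i16 (Scale 4 -> 2)
     | //   punpcklwd zero  -> i16 elements become i32 (Scale 2 -> 1)
     | // Any-extends use UNDEF instead of the zero vector for the high halves.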
9604 | |
9605 | /// \brief Try to lower a vector shuffle as a zero extension on any microarch. |
9606 | /// |
9607 | /// This routine will try to do everything in its power to cleverly lower |
9608 | /// a shuffle which happens to match the pattern of a zero extend. It doesn't |
9609 | /// check for the profitability of this lowering, it tries to aggressively |
9610 | /// match this pattern. It will use all of the micro-architectural details it |
9611 | /// can to emit an efficient lowering. It handles both blends with all-zero |
9612 | /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to |
9613 | /// masking out later). |
9614 | /// |
9615 | /// The reason we have dedicated lowering for zext-style shuffles is that they |
9616 | /// are both incredibly common and often quite performance sensitive. |
9617 | static SDValue lowerVectorShuffleAsZeroOrAnyExtend( |
9618 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
9619 | const APInt &Zeroable, const X86Subtarget &Subtarget, |
9620 | SelectionDAG &DAG) { |
9621 | int Bits = VT.getSizeInBits(); |
9622 | int NumLanes = Bits / 128; |
9623 | int NumElements = VT.getVectorNumElements(); |
9624 | int NumEltsPerLane = NumElements / NumLanes; |
9625 | assert(VT.getScalarSizeInBits() <= 32 &&
9626 |        "Exceeds 32-bit integer zero extension limit");
9627 | assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9628 | |
9629 | // Define a helper function to check a particular ext-scale and lower to it if |
9630 | // valid. |
9631 | auto Lower = [&](int Scale) -> SDValue { |
9632 | SDValue InputV; |
9633 | bool AnyExt = true; |
9634 | int Offset = 0; |
9635 | int Matches = 0; |
9636 | for (int i = 0; i < NumElements; ++i) { |
9637 | int M = Mask[i]; |
9638 | if (M < 0) |
9639 | continue; // Valid anywhere but doesn't tell us anything. |
9640 | if (i % Scale != 0) { |
9641 | // Each of the extended elements needs to be zeroable.
9642 | if (!Zeroable[i]) |
9643 | return SDValue(); |
9644 | |
9645 | // We no longer are in the anyext case. |
9646 | AnyExt = false; |
9647 | continue; |
9648 | } |
9649 | |
9650 | // The base elements need to be consecutive indices into the
9651 | // same input vector.
9652 | SDValue V = M < NumElements ? V1 : V2; |
9653 | M = M % NumElements; |
9654 | if (!InputV) { |
9655 | InputV = V; |
9656 | Offset = M - (i / Scale); |
9657 | } else if (InputV != V) |
9658 | return SDValue(); // Flip-flopping inputs. |
9659 | |
9660 | // Offset must start in the lowest 128-bit lane or at the start of an |
9661 | // upper lane. |
9662 | // FIXME: Is it ever worth allowing a negative base offset? |
9663 | if (!((0 <= Offset && Offset < NumEltsPerLane) || |
9664 | (Offset % NumEltsPerLane) == 0)) |
9665 | return SDValue(); |
9666 | |
9667 | // If we are offsetting, all referenced entries must come from the same |
9668 | // lane. |
9669 | if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) |
9670 | return SDValue(); |
9671 | |
9672 | if ((M % NumElements) != (Offset + (i / Scale))) |
9673 | return SDValue(); // Non-consecutive strided elements. |
9674 | Matches++; |
9675 | } |
9676 | |
9677 | // If we fail to find an input, we have a zero-shuffle which should always |
9678 | // have already been handled. |
9679 | // FIXME: Maybe handle this here in case during blending we end up with one? |
9680 | if (!InputV) |
9681 | return SDValue(); |
9682 | |
9683 | // If we are offsetting, don't extend if we only match a single input;
9684 | // we can always do better by using a basic PSHUF or PUNPCK.
9685 | if (Offset != 0 && Matches < 2) |
9686 | return SDValue(); |
9687 | |
9688 | return lowerVectorShuffleAsSpecificZeroOrAnyExtend( |
9689 | DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); |
9690 | }; |
9691 | |
9692 | // The widest scale possible for extending is to a 64-bit integer. |
9693 | assert(Bits % 64 == 0 &&
9694 |        "The number of bits in a vector must be divisible by 64 on x86!");
9695 | int NumExtElements = Bits / 64; |
9696 | |
9697 | // Each iteration, try extending the elements half as much, but into twice as |
9698 | // many elements. |
9699 | for (; NumExtElements < NumElements; NumExtElements *= 2) { |
9700 | assert(NumElements % NumExtElements == 0 &&
9701 |        "The input vector size must be divisible by the extended size.");
9702 | if (SDValue V = Lower(NumElements / NumExtElements)) |
9703 | return V; |
9704 | } |
9705 | |
9706 | // General extends failed, but 128-bit vectors may be able to use MOVQ. |
9707 | if (Bits != 128) |
9708 | return SDValue(); |
9709 | |
9710 | // Returns one of the source operands if the shuffle can be reduced to a |
9711 | // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. |
9712 | auto CanZExtLowHalf = [&]() { |
9713 | for (int i = NumElements / 2; i != NumElements; ++i) |
9714 | if (!Zeroable[i]) |
9715 | return SDValue(); |
9716 | if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) |
9717 | return V1; |
9718 | if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) |
9719 | return V2; |
9720 | return SDValue(); |
9721 | }; |
9722 | |
9723 | if (SDValue V = CanZExtLowHalf()) { |
9724 | V = DAG.getBitcast(MVT::v2i64, V); |
9725 | V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); |
9726 | return DAG.getBitcast(VT, V); |
9727 | } |
9728 | |
9729 | // No viable ext lowering found. |
9730 | return SDValue(); |
9731 | } |
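     | // For illustration of the MOVQ fallback: a v4i32 shuffle with
     | // Mask = [0, 1, zz, zz] and the top two elements zeroable reduces to
     | // X86ISD::VZEXT_MOVL on v2i64, which copies the low 64 bits of V1 and
     | // zeroes the upper 64 bits.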
9732 | |
9733 | /// \brief Try to get a scalar value for a specific element of a vector. |
9734 | /// |
9735 | /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. |
9736 | static SDValue getScalarValueForVectorElement(SDValue V, int Idx, |
9737 | SelectionDAG &DAG) { |
9738 | MVT VT = V.getSimpleValueType(); |
9739 | MVT EltVT = VT.getVectorElementType(); |
9740 | V = peekThroughBitcasts(V); |
9741 | |
9742 | // If the bitcasts shift the element size, we can't extract an equivalent |
9743 | // element from it. |
9744 | MVT NewVT = V.getSimpleValueType(); |
9745 | if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) |
9746 | return SDValue(); |
9747 | |
9748 | if (V.getOpcode() == ISD::BUILD_VECTOR || |
9749 | (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { |
9750 | // Ensure the scalar operand is the same size as the destination. |
9751 | // FIXME: Add support for scalar truncation where possible. |
9752 | SDValue S = V.getOperand(Idx); |
9753 | if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) |
9754 | return DAG.getBitcast(EltVT, S); |
9755 | } |
9756 | |
9757 | return SDValue(); |
9758 | } |
9759 | |
9760 | /// \brief Helper to test for a load that can be folded with x86 shuffles. |
9761 | /// |
9762 | /// This is particularly important because the set of instructions varies |
9763 | /// significantly based on whether the operand is a load or not. |
9764 | static bool isShuffleFoldableLoad(SDValue V) { |
9765 | V = peekThroughBitcasts(V); |
9766 | return ISD::isNON_EXTLoad(V.getNode()); |
9767 | } |
9768 | |
9769 | /// \brief Try to lower insertion of a single element into a zero vector. |
9770 | /// |
9771 | /// This is a common pattern for which we have especially efficient lowerings
9772 | /// across all subtarget feature sets.
9773 | static SDValue lowerVectorShuffleAsElementInsertion( |
9774 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
9775 | const APInt &Zeroable, const X86Subtarget &Subtarget, |
9776 | SelectionDAG &DAG) { |
9777 | MVT ExtVT = VT; |
9778 | MVT EltVT = VT.getVectorElementType(); |
9779 | |
9780 | int V2Index = |
9781 | find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - |
9782 | Mask.begin(); |
9783 | bool IsV1Zeroable = true; |
9784 | for (int i = 0, Size = Mask.size(); i < Size; ++i) |
9785 | if (i != V2Index && !Zeroable[i]) { |
9786 | IsV1Zeroable = false; |
9787 | break; |
9788 | } |
9789 | |
9790 | // Check for a single input from a SCALAR_TO_VECTOR node. |
9791 | // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and |
9792 | // all the smarts here sunk into that routine. However, the current |
9793 | // lowering of BUILD_VECTOR makes that nearly impossible until the old |
9794 | // vector shuffle lowering is dead. |
9795 | SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), |
9796 | DAG); |
9797 | if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { |
9798 | // We need to zext the scalar if it is smaller than an i32. |
9799 | V2S = DAG.getBitcast(EltVT, V2S); |
9800 | if (EltVT == MVT::i8 || EltVT == MVT::i16) { |
9801 | // Using zext to expand a narrow element won't work for non-zero |
9802 | // insertions. |
9803 | if (!IsV1Zeroable) |
9804 | return SDValue(); |
9805 | |
9806 | // Zero-extend directly to i32. |
9807 | ExtVT = MVT::v4i32; |
9808 | V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); |
9809 | } |
9810 | V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); |
9811 | } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || |
9812 | EltVT == MVT::i16) { |
9813 | // Either not inserting from the low element of the input or the input |
9814 | // element size is too small to use VZEXT_MOVL to clear the high bits. |
9815 | return SDValue(); |
9816 | } |
9817 | |
9818 | if (!IsV1Zeroable) { |
9819 | // If V1 can't be treated as a zero vector we have fewer options to lower |
9820 | // this. We can't support integer vectors or non-zero targets cheaply, and |
9821 | // the V1 elements can't be permuted in any way. |
9822 | assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9823 | if (!VT.isFloatingPoint() || V2Index != 0) |
9824 | return SDValue(); |
9825 | SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); |
9826 | V1Mask[V2Index] = -1; |
9827 | if (!isNoopShuffleMask(V1Mask)) |
9828 | return SDValue(); |
9829 | // This is essentially a special case blend operation, but if we have |
9830 | // general purpose blend operations, they are always faster. Bail and let |
9831 | // the rest of the lowering handle these as blends. |
9832 | if (Subtarget.hasSSE41()) |
9833 | return SDValue(); |
9834 | |
9835 | // Otherwise, use MOVSD or MOVSS. |
9836 | assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9837 |        "Only two types of floating point element types to handle!");
9838 | return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, |
9839 | ExtVT, V1, V2); |
9840 | } |
9841 | |
9842 | // This lowering only works for the low element with floating point vectors. |
9843 | if (VT.isFloatingPoint() && V2Index != 0) |
9844 | return SDValue(); |
9845 | |
9846 | V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); |
9847 | if (ExtVT != VT) |
9848 | V2 = DAG.getBitcast(VT, V2); |
9849 | |
9850 | if (V2Index != 0) { |
9851 | // If we have 4 or fewer lanes we can cheaply shuffle the element into |
9852 | // the desired position. Otherwise it is more efficient to do a vector |
9853 | // shift left. We know that we can do a vector shift left because all |
9854 | // the inputs are zero. |
9855 | if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { |
9856 | SmallVector<int, 4> V2Shuffle(Mask.size(), 1); |
9857 | V2Shuffle[V2Index] = 0; |
9858 | V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); |
9859 | } else { |
9860 | V2 = DAG.getBitcast(MVT::v16i8, V2); |
9861 | V2 = DAG.getNode( |
9862 | X86ISD::VSHLDQ, DL, MVT::v16i8, V2, |
9863 | DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, |
9864 | DAG.getTargetLoweringInfo().getScalarShiftAmountTy( |
9865 | DAG.getDataLayout(), VT))); |
9866 | V2 = DAG.getBitcast(VT, V2); |
9867 | } |
9868 | } |
9869 | return V2; |
9870 | } |
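     | // Worked example: for a v4f32 mask <4, 1, 2, 3>, V2Index is 0. If lanes 1-3
     | // are zeroable, V2's low scalar is inserted over a zero vector via
     | // VZEXT_MOVL; if V1 is live, the mask is a no-op on V1, so pre-SSE4.1 this
     | // becomes MOVSS(V1, V2), writing V2[0] into lane 0 and keeping V1's upper
     | // lanes.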
9871 | |
9872 | /// Try to lower a broadcast of a single, truncated integer element coming
9873 | /// from a scalar_to_vector/build_vector node \p V0 with larger elements.
9874 | /// |
9875 | /// This assumes we have AVX2. |
9876 | static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, |
9877 | SDValue V0, int BroadcastIdx, |
9878 | const X86Subtarget &Subtarget, |
9879 | SelectionDAG &DAG) { |
9880 | assert(Subtarget.hasAVX2() &&
9881 |        "We can only lower integer broadcasts with AVX2!");
9882 | |
9883 | EVT EltVT = VT.getVectorElementType(); |
9884 | EVT V0VT = V0.getValueType(); |
9885 | |
9886 | assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9887 | assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9888 | |
9889 | EVT V0EltVT = V0VT.getVectorElementType(); |
9890 | if (!V0EltVT.isInteger()) |
9891 | return SDValue(); |
9892 | |
9893 | const unsigned EltSize = EltVT.getSizeInBits(); |
9894 | const unsigned V0EltSize = V0EltVT.getSizeInBits(); |
9895 | |
9896 | // This is only a truncation if the original element type is larger. |
9897 | if (V0EltSize <= EltSize) |
9898 | return SDValue(); |
9899 | |
9900 | assert(((V0EltSize % EltSize) == 0) &&
9901 |        "Scalar type sizes must all be powers of 2 on x86!");
9902 | |
9903 | const unsigned V0Opc = V0.getOpcode(); |
9904 | const unsigned Scale = V0EltSize / EltSize; |
9905 | const unsigned V0BroadcastIdx = BroadcastIdx / Scale; |
9906 | |
9907 | if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && |
9908 | V0Opc != ISD::BUILD_VECTOR) |
9909 | return SDValue(); |
9910 | |
9911 | SDValue Scalar = V0.getOperand(V0BroadcastIdx); |
9912 | |
9913 | // If we're extracting non-least-significant bits, shift so we can truncate. |
9914 | // Hopefully, we can fold away the trunc/srl/load into the broadcast. |
9915 | // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer |
9916 | // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. |
9917 | if (const int OffsetIdx = BroadcastIdx % Scale) |
9918 | Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, |
9919 | DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); |
9920 | |
9921 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, |
9922 | DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); |
9923 | } |
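     | // Worked example: broadcasting element 3 of a v8i32 shuffle whose input is
     | // a bitcast v4i64 BUILD_VECTOR gives Scale = 64 / 32 = 2, V0BroadcastIdx = 1
     | // and OffsetIdx = 1, so the selected i64 scalar is shifted right by 32 bits,
     | // truncated to i32 and fed to X86ISD::VBROADCAST.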
9924 | |
9925 | /// \brief Try to lower broadcast of a single element. |
9926 | /// |
9927 | /// For convenience, this code also bundles all of the subtarget feature set |
9928 | /// filtering. While a little annoying to re-dispatch on type here, there isn't |
9929 | /// a convenient way to factor it out. |
9930 | static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, |
9931 | SDValue V1, SDValue V2, |
9932 | ArrayRef<int> Mask, |
9933 | const X86Subtarget &Subtarget, |
9934 | SelectionDAG &DAG) { |
9935 | if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || |
9936 | (Subtarget.hasAVX() && VT.isFloatingPoint()) || |
9937 | (Subtarget.hasAVX2() && VT.isInteger()))) |
9938 | return SDValue(); |
9939 | |
9940 | // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise |
9941 | // we can only broadcast from a register with AVX2. |
9942 | unsigned NumElts = Mask.size(); |
9943 | unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; |
9944 | bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); |
9945 | |
9946 | // Check that the mask is a broadcast. |
9947 | int BroadcastIdx = -1; |
9948 | for (int i = 0; i != (int)NumElts; ++i) { |
9949 | SmallVector<int, 8> BroadcastMask(NumElts, i); |
9950 | if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { |
9951 | BroadcastIdx = i; |
9952 | break; |
9953 | } |
9954 | } |
9955 | |
9956 | if (BroadcastIdx < 0) |
9957 | return SDValue(); |
9958 | assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9959 |        "a sorted mask where the broadcast "
9960 |        "comes from V1.");
9961 | |
9962 | // Go up the chain of (vector) values to find a scalar load that we can |
9963 | // combine with the broadcast. |
9964 | SDValue V = V1; |
9965 | for (;;) { |
9966 | switch (V.getOpcode()) { |
9967 | case ISD::BITCAST: { |
9968 | SDValue VSrc = V.getOperand(0); |
9969 | MVT SrcVT = VSrc.getSimpleValueType(); |
9970 | if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits()) |
9971 | break; |
9972 | V = VSrc; |
9973 | continue; |
9974 | } |
9975 | case ISD::CONCAT_VECTORS: { |
9976 | int OperandSize = Mask.size() / V.getNumOperands(); |
9977 | V = V.getOperand(BroadcastIdx / OperandSize); |
9978 | BroadcastIdx %= OperandSize; |
9979 | continue; |
9980 | } |
9981 | case ISD::INSERT_SUBVECTOR: { |
9982 | SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); |
9983 | auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); |
9984 | if (!ConstantIdx) |
9985 | break; |
9986 | |
9987 | int BeginIdx = (int)ConstantIdx->getZExtValue(); |
9988 | int EndIdx = |
9989 | BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); |
9990 | if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { |
9991 | BroadcastIdx -= BeginIdx; |
9992 | V = VInner; |
9993 | } else { |
9994 | V = VOuter; |
9995 | } |
9996 | continue; |
9997 | } |
9998 | } |
9999 | break; |
10000 | } |
10001 | |
10002 | // Check if this is a broadcast of a scalar. We special case lowering |
10003 | // for scalars so that we can more effectively fold with loads. |
10004 | // First, look through bitcast: if the original value has a larger element |
10005 | // type than the shuffle, the broadcast element is in essence truncated. |
10006 | // Make that explicit to ease folding. |
10007 | if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) |
10008 | if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( |
10009 | DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) |
10010 | return TruncBroadcast; |
10011 | |
10012 | MVT BroadcastVT = VT; |
10013 | |
10014 | // Peek through any bitcast (only useful for loads). |
10015 | SDValue BC = peekThroughBitcasts(V); |
10016 | |
10017 | // Also check the simpler case, where we can directly reuse the scalar. |
10018 | if (V.getOpcode() == ISD::BUILD_VECTOR || |
10019 | (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { |
10020 | V = V.getOperand(BroadcastIdx); |
10021 | |
10022 | // If we can't broadcast from a register, check that the input is a load. |
10023 | if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) |
10024 | return SDValue(); |
10025 | } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { |
10026 | // 32-bit targets need to load i64 as a f64 and then bitcast the result. |
10027 | if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { |
10028 | BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); |
10029 | Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode); |
10030 | } |
10031 | |
10032 | // If we are broadcasting a load that is only used by the shuffle |
10033 | // then we can reduce the vector load to the broadcasted scalar load. |
10034 | LoadSDNode *Ld = cast<LoadSDNode>(BC); |
10035 | SDValue BaseAddr = Ld->getOperand(1); |
10036 | EVT SVT = BroadcastVT.getScalarType(); |
10037 | unsigned Offset = BroadcastIdx * SVT.getStoreSize(); |
10038 | SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); |
10039 | V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, |
10040 | DAG.getMachineFunction().getMachineMemOperand( |
10041 | Ld->getMemOperand(), Offset, SVT.getStoreSize())); |
10042 | DAG.makeEquivalentMemoryOrdering(Ld, V); |
10043 | } else if (!BroadcastFromReg) { |
10044 | // We can't broadcast from a vector register. |
10045 | return SDValue(); |
10046 | } else if (BroadcastIdx != 0) { |
10047 | // We can only broadcast from the zero-element of a vector register, |
10048 | // but it can be advantageous to broadcast from the zero-element of a |
10049 | // subvector. |
10050 | if (!VT.is256BitVector() && !VT.is512BitVector()) |
10051 | return SDValue(); |
10052 | |
10053 | // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. |
10054 | if (VT == MVT::v4f64 || VT == MVT::v4i64) |
10055 | return SDValue(); |
10056 | |
10057 | // Only broadcast the zero-element of a 128-bit subvector. |
10058 | unsigned EltSize = VT.getScalarSizeInBits(); |
10059 | if (((BroadcastIdx * EltSize) % 128) != 0) |
10060 | return SDValue(); |
10061 | |
10062 | // The shuffle input might have been a bitcast we looked through; look at |
10063 | // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll |
10064 | // later bitcast it to BroadcastVT. |
10065 | MVT SrcVT = V.getSimpleValueType(); |
10066 | assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10067 |        "Unexpected vector element size");
10068 | assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10069 |        "Unexpected vector size");
10070 | |
10071 | MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize); |
10072 | V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, |
10073 | DAG.getIntPtrConstant(BroadcastIdx, DL)); |
10074 | } |
10075 | |
10076 | if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) |
10077 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, |
10078 | DAG.getBitcast(MVT::f64, V)); |
10079 | |
10080 | // Bitcast back to the same scalar type as BroadcastVT. |
10081 | MVT SrcVT = V.getSimpleValueType(); |
10082 | if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { |
10083 | assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10084 |        "Unexpected vector element size");
10085 | if (SrcVT.isVector()) { |
10086 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); |
10087 | SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); |
10088 | } else { |
10089 | SrcVT = BroadcastVT.getScalarType(); |
10090 | } |
10091 | V = DAG.getBitcast(SrcVT, V); |
10092 | } |
10093 | |
10094 | // 32-bit targets need to load i64 as a f64 and then bitcast the result. |
10095 | if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { |
10096 | V = DAG.getBitcast(MVT::f64, V); |
10097 | unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); |
10098 | BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); |
10099 | } |
10100 | |
10101 | // We only support broadcasting from 128-bit vectors to minimize the |
10102 | // number of patterns we need to deal with in isel. So extract down to |
10103 | // 128-bits. |
10104 | if (SrcVT.getSizeInBits() > 128) |
10105 | V = extract128BitVector(V, 0, DAG, DL); |
10106 | |
10107 | return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); |
10108 | } |
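     | // Worked example: with AVX2, a v8i32 mask <4,4,4,4,4,4,4,4> of a plain
     | // register gives BroadcastIdx = 4; since 4 * 32 == 128 is 128-bit aligned,
     | // the high v4i32 subvector is extracted and VPBROADCASTD splats its element
     | // 0. A misaligned index such as 6 (6 * 32 == 192) makes the routine bail
     | // rather than emit a cross-lane shuffle here.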
10109 | |
10110 | // Check whether we can use INSERTPS to perform the shuffle. We only use
10111 | // INSERTPS when the V1 elements are already in the correct locations |
10112 | // because otherwise we can just always use two SHUFPS instructions which |
10113 | // are much smaller to encode than a SHUFPS and an INSERTPS. We can also |
10114 | // perform INSERTPS if a single V1 element is out of place and all V2 |
10115 | // elements are zeroable. |
10116 | static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, |
10117 | unsigned &InsertPSMask, |
10118 | const APInt &Zeroable, |
10119 | ArrayRef<int> Mask, |
10120 | SelectionDAG &DAG) { |
10121 | assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10122 | assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10123 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10124 | |
10125 | // Attempt to match INSERTPS with one element from VA or VB being |
10126 | // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask |
10127 | // are updated. |
10128 | auto matchAsInsertPS = [&](SDValue VA, SDValue VB, |
10129 | ArrayRef<int> CandidateMask) { |
10130 | unsigned ZMask = 0; |
10131 | int VADstIndex = -1; |
10132 | int VBDstIndex = -1; |
10133 | bool VAUsedInPlace = false; |
10134 | |
10135 | for (int i = 0; i < 4; ++i) { |
10136 | // Synthesize a zero mask from the zeroable elements (includes undefs). |
10137 | if (Zeroable[i]) { |
10138 | ZMask |= 1 << i; |
10139 | continue; |
10140 | } |
10141 | |
10142 | // Flag if we use any VA inputs in place. |
10143 | if (i == CandidateMask[i]) { |
10144 | VAUsedInPlace = true; |
10145 | continue; |
10146 | } |
10147 | |
10148 | // We can only insert a single non-zeroable element. |
10149 | if (VADstIndex >= 0 || VBDstIndex >= 0) |
10150 | return false; |
10151 | |
10152 | if (CandidateMask[i] < 4) { |
10153 | // VA input out of place for insertion. |
10154 | VADstIndex = i; |
10155 | } else { |
10156 | // VB input for insertion. |
10157 | VBDstIndex = i; |
10158 | } |
10159 | } |
10160 | |
10161 | // Don't bother if we have no (non-zeroable) element for insertion. |
10162 | if (VADstIndex < 0 && VBDstIndex < 0) |
10163 | return false; |
10164 | |
10165 | // Determine element insertion src/dst indices. The src index is from the |
10166 | // start of the inserted vector, not the start of the concatenated vector. |
10167 | unsigned VBSrcIndex = 0; |
10168 | if (VADstIndex >= 0) { |
10169 | // If we have a VA input out of place, we use VA as the V2 element |
10170 | // insertion and don't use the original V2 at all. |
10171 | VBSrcIndex = CandidateMask[VADstIndex]; |
10172 | VBDstIndex = VADstIndex; |
10173 | VB = VA; |
10174 | } else { |
10175 | VBSrcIndex = CandidateMask[VBDstIndex] - 4; |
10176 | } |
10177 | |
10178 | // If no V1 inputs are used in place, then the result is created only from |
10179 | // the zero mask and the V2 insertion - so remove V1 dependency. |
10180 | if (!VAUsedInPlace) |
10181 | VA = DAG.getUNDEF(MVT::v4f32); |
10182 | |
10183 | // Update V1, V2 and InsertPSMask accordingly. |
10184 | V1 = VA; |
10185 | V2 = VB; |
10186 | |
10187 | // Insert the V2 element into the desired position. |
10188 | InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; |
10189 | assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10190 | return true; |
10191 | }; |
10192 | |
10193 | if (matchAsInsertPS(V1, V2, Mask)) |
10194 | return true; |
10195 | |
10196 | // Commute and try again. |
10197 | SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end()); |
10198 | ShuffleVectorSDNode::commuteMask(CommutedMask); |
10199 | if (matchAsInsertPS(V2, V1, CommutedMask)) |
10200 | return true; |
10201 | |
10202 | return false; |
10203 | } |
10204 | |
10205 | static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, |
10206 | SDValue V2, ArrayRef<int> Mask, |
10207 | const APInt &Zeroable, |
10208 | SelectionDAG &DAG) { |
10209 | assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10210 | assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10211 | |
10212 | // Attempt to match the insertps pattern. |
10213 | unsigned InsertPSMask; |
10214 | if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) |
10215 | return SDValue(); |
10216 | |
10217 | // Insert the V2 element into the desired position. |
10218 | return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, |
10219 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
10220 | } |
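     | // The 8-bit INSERTPS immediate encodes the source element in bits [7:6],
     | // the destination lane in bits [5:4] and the zero mask in bits [3:0].
     | // Worked example: a v4f32 mask <0, 1, 5, 3> with nothing zeroable matches
     | // with VBSrcIndex = 1 and VBDstIndex = 2, giving the immediate
     | // (1 << 6) | (2 << 4) = 0x60, i.e. insert V2[1] into lane 2 of V1.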
10221 | |
10222 | /// \brief Try to lower a shuffle as a permute of the inputs followed by an |
10223 | /// UNPCK instruction. |
10224 | /// |
10225 | /// This specifically targets cases where we end up alternating between
10226 | /// the two inputs, and so can permute them into something that feeds a single |
10227 | /// UNPCK instruction. Note that this routine only targets integer vectors |
10228 | /// because for floating point vectors we have a generalized SHUFPS lowering |
10229 | /// strategy that handles everything that doesn't *exactly* match an unpack, |
10230 | /// making this clever lowering unnecessary. |
10231 | static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, |
10232 | SDValue V1, SDValue V2, |
10233 | ArrayRef<int> Mask, |
10234 | SelectionDAG &DAG) { |
10235 | assert(!VT.isFloatingPoint() &&
10236 |        "This routine only supports integer vectors.");
10237 | assert(VT.is128BitVector() &&
10238 |        "This routine only works on 128-bit vectors.");
10239 | assert(!V2.isUndef() &&
10240 |        "This routine should only be used when blending two inputs.");
10241 | assert(Mask.size() >= 2 && "Single element masks are invalid.");
10242 | |
10243 | int Size = Mask.size(); |
10244 | |
10245 | int NumLoInputs = |
10246 | count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); |
10247 | int NumHiInputs = |
10248 | count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); |
10249 | |
10250 | bool UnpackLo = NumLoInputs >= NumHiInputs; |
10251 | |
10252 | auto TryUnpack = [&](int ScalarSize, int Scale) { |
10253 | SmallVector<int, 16> V1Mask((unsigned)Size, -1); |
10254 | SmallVector<int, 16> V2Mask((unsigned)Size, -1); |
10255 | |
10256 | for (int i = 0; i < Size; ++i) { |
10257 | if (Mask[i] < 0) |
10258 | continue; |
10259 | |
10260 | // Each element of the unpack contains Scale elements from this mask. |
10261 | int UnpackIdx = i / Scale; |
10262 | |
10263 | // We only handle the case where V1 feeds the first slots of the unpack. |
10264 | // We rely on canonicalization to ensure this is the case. |
10265 | if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) |
10266 | return SDValue(); |
10267 | |
10268 | // Setup the mask for this input. The indexing is tricky as we have to |
10269 | // handle the unpack stride. |
10270 | SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; |
10271 | VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = |
10272 | Mask[i] % Size; |
10273 | } |
10274 | |
10275 | // If we will have to shuffle both inputs to use the unpack, check whether |
10276 | // we can just unpack first and shuffle the result. If so, skip this unpack. |
10277 | if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && |
10278 | !isNoopShuffleMask(V2Mask)) |
10279 | return SDValue(); |
10280 | |
10281 | // Shuffle the inputs into place. |
10282 | V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); |
10283 | V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); |
10284 | |
10285 | // Cast the inputs to the type we will use to unpack them. |
10286 | MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); |
10287 | V1 = DAG.getBitcast(UnpackVT, V1); |
10288 | V2 = DAG.getBitcast(UnpackVT, V2); |
10289 | |
10290 | // Unpack the inputs and cast the result back to the desired type. |
10291 | return DAG.getBitcast( |
10292 | VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, |
10293 | UnpackVT, V1, V2)); |
10294 | }; |
10295 | |
10296 | // We try each unpack from the largest to the smallest to try and find one |
10297 | // that fits this mask. |
10298 | int OrigScalarSize = VT.getScalarSizeInBits(); |
10299 | for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) |
10300 | if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) |
10301 | return Unpack; |
10302 | |
10303 | // If none of the unpack-rooted lowerings worked (or were profitable) try an |
10304 | // initial unpack. |
10305 | if (NumLoInputs == 0 || NumHiInputs == 0) { |
10306 | assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10307 |        "We have to have *some* inputs!");
10308 | int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; |
10309 | |
10310 | // FIXME: We could consider the total complexity of the permute of each |
10311 | // possible unpacking. Or at the least we should consider how many |
10312 | // half-crossings are created. |
10313 | // FIXME: We could consider commuting the unpacks. |
10314 | |
10315 | SmallVector<int, 32> PermMask((unsigned)Size, -1); |
10316 | for (int i = 0; i < Size; ++i) { |
10317 | if (Mask[i] < 0) |
10318 | continue; |
10319 | |
10320 | assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10321 | |
10322 | PermMask[i] = |
10323 | 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); |
10324 | } |
10325 | return DAG.getVectorShuffle( |
10326 | VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, |
10327 | DL, VT, V1, V2), |
10328 | DAG.getUNDEF(VT), PermMask); |
10329 | } |
10330 | |
10331 | return SDValue(); |
10332 | } |
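     | // Worked example: the v4i32 mask <0, 2, 4, 6> succeeds at ScalarSize = 64
     | // (Scale = 2): each input is first permuted to pack its even elements into
     | // its low half (<0, 2, -1, -1>), and a single 64-bit UNPCKL then
     | // interleaves the two packed halves into the requested order.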
10333 | |
10334 | /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. |
10335 | /// |
10336 | /// This is the basis function for the 2-lane 64-bit shuffles as we have full |
10337 | /// support for floating point shuffles but not integer shuffles. These |
10338 | /// instructions will incur a domain crossing penalty on some chips though so |
10339 | /// it is better to avoid lowering through this for integer vectors where |
10340 | /// possible. |
10341 | static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
10342 | const APInt &Zeroable, |
10343 | SDValue V1, SDValue V2, |
10344 | const X86Subtarget &Subtarget, |
10345 | SelectionDAG &DAG) { |
10346 | assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10347 | assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10348 | assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10349 | |
10350 | if (V2.isUndef()) { |
10351 | // Check for being able to broadcast a single element. |
10352 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
10353 | DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) |
10354 | return Broadcast; |
10355 | |
10356 | // Straight shuffle of a single input vector. Simulate this by using the |
10357 | // single input as both of the "inputs" to this instruction.
10358 | unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); |
10359 | |
10360 | if (Subtarget.hasAVX()) { |
10361 | // If we have AVX, we can use VPERMILPD, which will allow folding a load
10362 | // into the shuffle. |
10363 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, |
10364 | DAG.getConstant(SHUFPDMask, DL, MVT::i8)); |
10365 | } |
10366 | |
10367 | return DAG.getNode( |
10368 | X86ISD::SHUFP, DL, MVT::v2f64, |
10369 | Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, |
10370 | Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, |
10371 | DAG.getConstant(SHUFPDMask, DL, MVT::i8)); |
10372 | } |
10373 | assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10374 | assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10375 | |
10376 | // If we have a single input, insert that into V1 if we can do so cheaply. |
10377 | if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { |
10378 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
10379 | DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
10380 | return Insertion; |
10381 | // Try inverting the insertion since for v2 masks it is easy to do and we |
10382 | // can't reliably sort the mask one way or the other. |
10383 | int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), |
10384 | Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; |
10385 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
10386 | DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) |
10387 | return Insertion; |
10388 | } |
10389 | |
10390 | // Try to use one of the special instruction patterns to handle two common |
10391 | // blend patterns if a zero-blend above didn't work. |
10392 | if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || |
10393 | isShuffleEquivalent(V1, V2, Mask, {1, 3})) |
10394 | if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) |
10395 | // We can either use a special instruction to load over the low double or |
10396 | // to move just the low double. |
10397 | return DAG.getNode( |
10398 | isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, |
10399 | DL, MVT::v2f64, V2, |
10400 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); |
10401 | |
10402 | if (Subtarget.hasSSE41()) |
10403 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, |
10404 | Zeroable, Subtarget, DAG)) |
10405 | return Blend; |
10406 | |
10407 | // Use dedicated unpack instructions for masks that match their pattern. |
10408 | if (SDValue V = |
10409 | lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) |
10410 | return V; |
10411 | |
10412 | unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); |
10413 | return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, |
10414 | DAG.getConstant(SHUFPDMask, DL, MVT::i8)); |
10415 | } |
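     | // Worked examples: the single-input mask <1, 1> yields SHUFPDMask = 3,
     | // duplicating the high double (VPERMILPD $3 on AVX), while a two-input
     | // blend such as <0, 3>, if no earlier pattern matched, reaches the final
     | // SHUFPD with immediate 2, selecting V1[0] and V2[1].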
10416 | |
10417 | /// \brief Handle lowering of 2-lane 64-bit integer shuffles. |
10418 | /// |
10419 | /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by |
10420 | /// the integer unit to minimize domain crossing penalties. However, for blends |
10421 | /// it falls back to the floating point shuffle operation with appropriate bit |
10422 | /// casting. |
10423 | static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
10424 | const APInt &Zeroable, |
10425 | SDValue V1, SDValue V2, |
10426 | const X86Subtarget &Subtarget, |
10427 | SelectionDAG &DAG) { |
10428 | assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10429 | assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10430 | assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10431 | |
10432 | if (V2.isUndef()) { |
10433 | // Check for being able to broadcast a single element. |
10434 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
10435 | DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) |
10436 | return Broadcast; |
10437 | |
10438 | // Straight shuffle of a single input vector. For everything from SSE2 |
10439 | // onward this has a single fast instruction with no scary immediates. |
10440 | // We have to map the mask as it is actually a v4i32 shuffle instruction. |
10441 | V1 = DAG.getBitcast(MVT::v4i32, V1); |
10442 | int WidenedMask[4] = { |
10443 | std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, |
10444 | std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; |
10445 | return DAG.getBitcast( |
10446 | MVT::v2i64, |
10447 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, |
10448 | getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); |
10449 | } |
10450 | assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10451 | assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10452 | assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10453 | assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10454 | |
10455 | // If we have a blend of two same-type PACKUS operations and the blend aligns |
10456 | // with the low and high halves, we can just merge the PACKUS operations. |
10457 | // This is particularly important as it lets us merge shuffles that this |
10458 | // routine itself creates. |
10459 | auto GetPackNode = [](SDValue V) { |
10460 | V = peekThroughBitcasts(V); |
10461 | return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); |
10462 | }; |
10463 | if (SDValue V1Pack = GetPackNode(V1)) |
10464 | if (SDValue V2Pack = GetPackNode(V2)) { |
10465 | EVT PackVT = V1Pack.getValueType(); |
10466 | if (PackVT == V2Pack.getValueType()) |
10467 | return DAG.getBitcast(MVT::v2i64, |
10468 | DAG.getNode(X86ISD::PACKUS, DL, PackVT, |
10469 | Mask[0] == 0 ? V1Pack.getOperand(0) |
10470 | : V1Pack.getOperand(1), |
10471 | Mask[1] == 2 ? V2Pack.getOperand(0) |
10472 | : V2Pack.getOperand(1))); |
10473 | } |
10474 | |
10475 | // Try to use shift instructions. |
10476 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, |
10477 | Zeroable, Subtarget, DAG)) |
10478 | return Shift; |
10479 | |
10480 | // When loading a scalar and then shuffling it into a vector we can often do |
10481 | // the insertion cheaply. |
10482 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
10483 | DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
10484 | return Insertion; |
10485 | // Try inverting the insertion since for v2 masks it is easy to do and we |
10486 | // can't reliably sort the mask one way or the other. |
10487 | int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; |
10488 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
10489 | DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) |
10490 | return Insertion; |
10491 | |
10492 | // We have different paths for blend lowering, but they all must use the |
10493 | // *exact* same predicate. |
10494 | bool IsBlendSupported = Subtarget.hasSSE41(); |
10495 | if (IsBlendSupported) |
10496 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, |
10497 | Zeroable, Subtarget, DAG)) |
10498 | return Blend; |
10499 | |
10500 | // Use dedicated unpack instructions for masks that match their pattern. |
10501 | if (SDValue V = |
10502 | lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) |
10503 | return V; |
10504 | |
10505 | // Try to use byte rotation instructions. |
10506 | // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10507 | if (Subtarget.hasSSSE3()) |
10508 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
10509 | DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) |
10510 | return Rotate; |
10511 | |
10512 | // If we have direct support for blends, we should lower by decomposing into |
10513 | // a permute. That will be faster than the domain cross. |
10514 | if (IsBlendSupported) |
10515 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, |
10516 | Mask, DAG); |
10517 | |
10518 | // We implement this with SHUFPD which is pretty lame because it will likely |
10519 | // incur 2 cycles of stall for integer vectors on Nehalem and older chips. |
10520 | // However, all the alternatives are still more cycles and newer chips don't |
10521 | // have this problem. It would be really nice if x86 had better shuffles here. |
10522 | V1 = DAG.getBitcast(MVT::v2f64, V1); |
10523 | V2 = DAG.getBitcast(MVT::v2f64, V2); |
10524 | return DAG.getBitcast(MVT::v2i64, |
10525 | DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); |
10526 | } |
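     | // Worked example: the single-input v2i64 mask <1, 0> widens to the v4i32
     | // mask <2, 3, 0, 1> and is emitted as PSHUFD with immediate 0x4E, swapping
     | // the two 64-bit halves without leaving the integer domain.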
10527 | |
10528 | /// \brief Test whether this can be lowered with a single SHUFPS instruction. |
10529 | /// |
10530 | /// This is used to disable more specialized lowerings when the shufps lowering |
10531 | /// will happen to be efficient. |
10532 | static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { |
10533 | // This routine only handles 128-bit shufps. |
10534 | assert(Mask.size() == 4 && "Unsupported mask size!");
10535 | assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10536 | assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10537 | assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10538 | assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10539 | |
10540 | // To lower with a single SHUFPS we need to have the low half and high half |
10541 | // each requiring a single input. |
10542 | if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) |
10543 | return false; |
10544 | if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) |
10545 | return false; |
10546 | |
10547 | return true; |
10548 | } |
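     | // For example, <0, 1, 4, 5> is a single-SHUFPS mask (the low half reads
     | // only V1 and the high half only V2), while <0, 4, 1, 5> is not, because
     | // its low half mixes elements from both inputs.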
10549 | |
10550 | /// \brief Lower a vector shuffle using the SHUFPS instruction. |
10551 | /// |
10552 | /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. |
10553 | /// It makes no assumptions about whether this is the *best* lowering; it simply
10554 | /// uses it. |
10555 | static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, |
10556 | ArrayRef<int> Mask, SDValue V1, |
10557 | SDValue V2, SelectionDAG &DAG) { |
10558 | SDValue LowV = V1, HighV = V2; |
10559 | int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; |
10560 | |
10561 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); |
10562 | |
10563 | if (NumV2Elements == 1) { |
10564 | int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); |
10565 | |
10566 | // Compute the index adjacent to V2Index and in the same half by toggling |
10567 | // the low bit. |
10568 | int V2AdjIndex = V2Index ^ 1; |
10569 | |
10570 | if (Mask[V2AdjIndex] < 0) { |
10571 | // Handles all the cases where we have a single V2 element and an undef. |
10572 | // This will only ever happen in the high lanes because we commute the |
10573 | // vector otherwise. |
10574 | if (V2Index < 2) |
10575 | std::swap(LowV, HighV); |
10576 | NewMask[V2Index] -= 4; |
10577 | } else { |
10578 | // Handle the case where the V2 element ends up adjacent to a V1 element. |
10579 | // To make this work, blend them together as the first step. |
10580 | int V1Index = V2AdjIndex; |
10581 | int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; |
10582 | V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, |
10583 | getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); |
10584 | |
10585 | // Now proceed to reconstruct the final blend as we have the necessary |
10586 | // high or low half formed. |
10587 | if (V2Index < 2) { |
10588 | LowV = V2; |
10589 | HighV = V1; |
10590 | } else { |
10591 | HighV = V2; |
10592 | } |
10593 | NewMask[V1Index] = 2; // We put the V1 element in V2[2]. |
10594 | NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. |
10595 | } |
10596 | } else if (NumV2Elements == 2) { |
10597 | if (Mask[0] < 4 && Mask[1] < 4) { |
10598 | // Handle the easy case where we have V1 in the low lanes and V2 in the |
10599 | // high lanes. |
10600 | NewMask[2] -= 4; |
10601 | NewMask[3] -= 4; |
10602 | } else if (Mask[2] < 4 && Mask[3] < 4) { |
10603 | // We also handle the reversed case because this utility may get called |
10604 | // when we detect a SHUFPS pattern but can't easily commute the shuffle to |
10605 | // arrange things in the right direction. |
10606 | NewMask[0] -= 4; |
10607 | NewMask[1] -= 4; |
10608 | HighV = V1; |
10609 | LowV = V2; |
10610 | } else { |
10611 | // We have a mixture of V1 and V2 in both low and high lanes. Rather than |
10612 | // trying to place elements directly, just blend them and set up the final |
10613 | // shuffle to place them. |
10614 | |
10615 | // The first two blend mask elements are for V1, the second two are for |
10616 | // V2. |
10617 | int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], |
10618 | Mask[2] < 4 ? Mask[2] : Mask[3], |
10619 | (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, |
10620 | (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; |
10621 | V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, |
10622 | getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); |
10623 | |
10624 | // Now we do a normal shuffle of V1 by giving V1 as both operands to |
10625 | // a blend. |
10626 | LowV = HighV = V1; |
10627 | NewMask[0] = Mask[0] < 4 ? 0 : 2; |
10628 | NewMask[1] = Mask[0] < 4 ? 2 : 0; |
10629 | NewMask[2] = Mask[2] < 4 ? 1 : 3; |
10630 | NewMask[3] = Mask[2] < 4 ? 3 : 1; |
10631 | } |
10632 | } |
10633 | return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, |
10634 | getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); |
10635 | } |
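     | // Worked example of the NumV2Elements == 1 path: for the v4f32 mask
     | // <0, 4, 2, 3> the lone V2 element lands next to a live V1 element, so a
     | // first SHUFPS blends them into <V2[0], V2[0], V1[0], V1[0]> and a second
     | // SHUFPS with mask <2, 0, 2, 3> rebuilds the requested
     | // <V1[0], V2[0], V1[2], V1[3]>.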
10636 | |
10637 | /// \brief Lower 4-lane 32-bit floating point shuffles. |
10638 | /// |
10639 | /// Uses instructions exclusively from the floating point unit to minimize |
10640 | /// domain crossing penalties, as these are sufficient to implement all v4f32 |
10641 | /// shuffles. |
10642 | static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
10643 | const APInt &Zeroable, |
10644 | SDValue V1, SDValue V2, |
10645 | const X86Subtarget &Subtarget, |
10646 | SelectionDAG &DAG) { |
10647 | assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10648 | assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10649 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10650 | |
10651 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); |
10652 | |
10653 | if (NumV2Elements == 0) { |
10654 | // Check for being able to broadcast a single element. |
10655 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
10656 | DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) |
10657 | return Broadcast; |
10658 | |
10659 | // Use even/odd duplicate instructions for masks that match their pattern. |
10660 | if (Subtarget.hasSSE3()) { |
10661 | if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) |
10662 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); |
10663 | if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) |
10664 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); |
10665 | } |
10666 | |
10667 | if (Subtarget.hasAVX()) { |
10668 | // If we have AVX, we can use VPERMILPS which will allow folding a load |
10669 | // into the shuffle. |
10670 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, |
10671 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
10672 | } |
10673 | |
10674 | // Otherwise, use a straight shuffle of a single input vector. We pass the |
10675 | // input vector to both operands to simulate this with a SHUFPS. |
10676 | return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, |
10677 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
10678 | } |
10679 | |
10680 | // There are special ways we can lower some single-element blends. However, we
10681 | // also have custom lowerings below for more complex single-element blends,
10682 | // which we fall back on if both this and BLENDPS fail to match. So restrict
10683 | // this path to the case where the V2 input targets element 0 of the mask,
10684 | // as that is the fast case here.
10685 | if (NumV2Elements == 1 && Mask[0] >= 4) |
10686 | if (SDValue V = lowerVectorShuffleAsElementInsertion( |
10687 | DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
10688 | return V; |
10689 | |
10690 | if (Subtarget.hasSSE41()) { |
10691 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, |
10692 | Zeroable, Subtarget, DAG)) |
10693 | return Blend; |
10694 | |
10695 | // Use INSERTPS if we can complete the shuffle efficiently. |
10696 | if (SDValue V = |
10697 | lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) |
10698 | return V; |
10699 | |
10700 | if (!isSingleSHUFPSMask(Mask)) |
10701 | if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( |
10702 | DL, MVT::v4f32, V1, V2, Mask, DAG)) |
10703 | return BlendPerm; |
10704 | } |
10705 | |
10706 | // Use low/high mov instructions. |
10707 | if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) |
10708 | return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); |
10709 | if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) |
10710 | return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); |
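      | // (Illustration: MOVLHPS of [a,b,c,d] and [e,f,g,h] yields [a,b,e,f], and
      | // MOVHLPS with the operands swapped as above yields [c,d,g,h].)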
10711 | |
10712 | // Use dedicated unpack instructions for masks that match their pattern. |
10713 | if (SDValue V = |
10714 | lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) |
10715 | return V; |
10716 | |
10717 | // Otherwise fall back to a SHUFPS lowering strategy. |
10718 | return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); |
10719 | } |
10720 | |
10721 | /// \brief Lower 4-lane i32 vector shuffles. |
10722 | /// |
10723 | /// We try to handle these with integer-domain shuffles where we can, but for |
10724 | /// blends we use the floating point domain blend instructions. |
10725 | static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
10726 | const APInt &Zeroable, |
10727 | SDValue V1, SDValue V2, |
10728 | const X86Subtarget &Subtarget, |
10729 | SelectionDAG &DAG) { |
10730 | assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10731 | assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10732 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10733 | |
10734 | // Whenever we can lower this as a zext, that instruction is strictly faster |
10735 | // than any alternative. It also allows us to fold memory operands into the |
10736 | // shuffle in many cases. |
10737 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
10738 | DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
10739 | return ZExt; |
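      | // For instance, a v4i32 mask of <0,Z,1,Z> (Z denoting a known-zero lane)
      | // is just a zero-extension of the two low i32 lanes, which a PMOVZX-style
      | // lowering can do in a single instruction.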
10740 | |
10741 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); |
10742 | |
10743 | if (NumV2Elements == 0) { |
10744 | // Check for being able to broadcast a single element. |
10745 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
10746 | DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) |
10747 | return Broadcast; |
10748 | |
10749 | // Straight shuffle of a single input vector. For everything from SSE2 |
10750 | // onward this has a single fast instruction with no scary immediates. |
10751 | // We coerce the shuffle pattern to be compatible with UNPCK instructions |
10752 | // but we aren't actually going to use the UNPCK instruction because doing |
10753 | // so prevents folding a load into this instruction or making a copy. |
10754 | const int UnpackLoMask[] = {0, 0, 1, 1}; |
10755 | const int UnpackHiMask[] = {2, 2, 3, 3}; |
10756 | if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) |
10757 | Mask = UnpackLoMask; |
10758 | else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) |
10759 | Mask = UnpackHiMask; |
10760 | |
10761 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, |
10762 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
10763 | } |
10764 | |
10765 | // Try to use shift instructions. |
10766 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, |
10767 | Zeroable, Subtarget, DAG)) |
10768 | return Shift; |
10769 | |
10770 | // There are special ways we can lower some single-element blends. |
10771 | if (NumV2Elements == 1) |
10772 | if (SDValue V = lowerVectorShuffleAsElementInsertion( |
10773 | DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
10774 | return V; |
10775 | |
10776 | // We have different paths for blend lowering, but they all must use the |
10777 | // *exact* same predicate. |
10778 | bool IsBlendSupported = Subtarget.hasSSE41(); |
10779 | if (IsBlendSupported) |
10780 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, |
10781 | Zeroable, Subtarget, DAG)) |
10782 | return Blend; |
10783 | |
10784 | if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, |
10785 | Zeroable, DAG)) |
10786 | return Masked; |
10787 | |
10788 | // Use dedicated unpack instructions for masks that match their pattern. |
10789 | if (SDValue V = |
10790 | lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) |
10791 | return V; |
10792 | |
10793 | // Try to use byte rotation instructions. |
10794 | // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10795 | if (Subtarget.hasSSSE3()) |
10796 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
10797 | DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) |
10798 | return Rotate; |
10799 | |
10800 | // Assume that a single SHUFPS is faster than an alternative sequence of |
10801 | // multiple instructions (even if the CPU has a domain penalty). |
10802 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. |
10803 | if (!isSingleSHUFPSMask(Mask)) { |
10804 | // If we have direct support for blends, we should lower by decomposing into
10805 | // single-input permutes and a blend. That will be faster than the domain cross.
10806 | if (IsBlendSupported) |
10807 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, |
10808 | Mask, DAG); |
10809 | |
10810 | // Try to lower by permuting the inputs into an unpack instruction. |
10811 | if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( |
10812 | DL, MVT::v4i32, V1, V2, Mask, DAG)) |
10813 | return Unpack; |
10814 | } |
10815 | |
10816 | // We implement this with SHUFPS because it can blend from two vectors. |
10817 | // Because we're going to eventually use SHUFPS, we use SHUFPS even to build |
10818 | // up the inputs, bypassing domain shift penalties that we would incur if we |
10819 | // directly used PSHUFD on Nehalem and older. For newer chips, this isn't |
10820 | // relevant. |
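      | // (Illustration: a mask such as <3,2,5,4> takes its low lanes from V1 and
      | // its high lanes from V2, so a single SHUFPS covers it, whereas PSHUFD can
      | // only read one source.)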
10821 | SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); |
10822 | SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); |
10823 | SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); |
10824 | return DAG.getBitcast(MVT::v4i32, ShufPS); |
10825 | } |
10826 | |
10827 | /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 |
10828 | /// shuffle lowering, and the most complex part. |
10829 | /// |
10830 | /// The lowering strategy is to try to form pairs of input lanes which are |
10831 | /// targeted at the same half of the final vector, and then use a dword shuffle |
10832 | /// to place them onto the right half, and finally unpack the paired lanes into |
10833 | /// their final position. |
10834 | /// |
10835 | /// The exact breakdown of how to form these dword pairs and align them on the |
10836 | /// correct sides is really tricky. See the comments within the function for |
10837 | /// more of the details. |
10838 | /// |
10839 | /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each |
10840 | /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to |
10841 | /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 |
10842 | /// vector, form the analogous 128-bit 8-element Mask. |
10843 | static SDValue lowerV8I16GeneralSingleInputVectorShuffle( |
10844 | const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, |
10845 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
10846 | assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10847 | MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); |
10848 | |
10849 | assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10850 | MutableArrayRef<int> LoMask = Mask.slice(0, 4); |
10851 | MutableArrayRef<int> HiMask = Mask.slice(4, 4); |
10852 | |
10853 | SmallVector<int, 4> LoInputs; |
10854 | copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); |
10855 | std::sort(LoInputs.begin(), LoInputs.end()); |
10856 | LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); |
10857 | SmallVector<int, 4> HiInputs; |
10858 | copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); |
10859 | std::sort(HiInputs.begin(), HiInputs.end()); |
10860 | HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); |
10861 | int NumLToL = |
10862 | std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); |
10863 | int NumHToL = LoInputs.size() - NumLToL; |
10864 | int NumLToH = |
10865 | std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); |
10866 | int NumHToH = HiInputs.size() - NumLToH; |
10867 | MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); |
10868 | MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); |
10869 | MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); |
10870 | MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); |
10871 | |
10872 | // If we are splatting two values from one half (one destined for each half),
10873 | // then we can shuffle that half so each is splatted to a dword, then splat
10874 | // those dwords to their respective halves.
10875 | auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, |
10876 | int DOffset) { |
10877 | int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; |
10878 | int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; |
10879 | V = DAG.getNode(ShufWOp, DL, VT, V, |
10880 | getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); |
10881 | V = DAG.getBitcast(PSHUFDVT, V); |
10882 | V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, |
10883 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); |
10884 | return DAG.getBitcast(VT, V); |
10885 | }; |
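      | // Worked example (illustrative): for Mask = <0,0,0,0,2,2,2,2> there is one
      | // low-half input targeting each half, so SplatHalfs(0, 2, X86ISD::PSHUFLW, 0)
      | // first applies PSHUFLW<0,0,2,2>, turning [a,b,c,d,e,f,g,h] into
      | // [a,a,c,c,e,f,g,h], then PSHUFD<0,0,1,1> splats those dwords to give
      | // [a,a,a,a,c,c,c,c].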
10886 | |
10887 | if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) |
10888 | return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); |
10889 | if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) |
10890 | return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); |
10891 | |
10892 | // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10893 | // such inputs we can swap two of the dwords across the half mark and end up
10894 | // with at most two inputs targeting each half from each half. Once there, we
10895 | // can fall through to the generic code below. For example:
10896 | // |
10897 | // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] |
10898 | // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] |
10899 | // |
10900 | // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half |
10901 | // and an existing 2-into-2 on the other half. In this case we may have to |
10902 | // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or |
10903 | // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. |
10904 | // Fortunately, we don't have to handle anything but a 2-into-2 pattern |
10905 | // because any other situation (including a 3-into-1 or 1-into-3 in the half
10906 | // other than the one we target for fixing) will be fixed when we re-enter
10907 | // this path. We will also combine any resulting sequence of PSHUFD
10908 | // instructions into a single instruction. Here is an example of the tricky case:
10909 | // |
10910 | // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] |
10911 | // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] |
10912 | // |
10913 | // This now has a 1-into-3 in the high half! Instead, we do two shuffles: |
10914 | // |
10915 | // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] |
10916 | // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] |
10917 | // |
10918 | // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] |
10919 | // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] |
10920 | // |
10921 | // The result is fine to be handled by the generic logic. |
10922 | auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, |
10923 | ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, |
10924 | int AOffset, int BOffset) { |
10925 | assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10926 |        "Must call this with A having 3 or 1 inputs from the A half.");
10927 | assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10928 |        "Must call this with B having 1 or 3 inputs from the B half.");
10929 | assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10930 |        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10931 | |
10932 | bool ThreeAInputs = AToAInputs.size() == 3; |
10933 | |
10934 | // Compute the index of the dword that holds the lone non-input word in the
10935 | // half with three inputs: take the sum of all four word indices in that
10936 | // half and subtract the sum of the three actual inputs. The difference is
10937 | // the remaining slot.
10938 | int ADWord, BDWord; |
10939 | int &TripleDWord = ThreeAInputs ? ADWord : BDWord; |
10940 | int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; |
10941 | int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; |
10942 | ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; |
10943 | int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; |
10944 | int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); |
10945 | int TripleNonInputIdx = |
10946 | TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); |
10947 | TripleDWord = TripleNonInputIdx / 2; |
10948 | |
10949 | // We use xor with one to compute the adjacent DWord to whichever one the |
10950 | // OneInput is in. |
10951 | OneInputDWord = (OneInput / 2) ^ 1; |
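      | // (For example, if OneInput is word 5, it lives in dword 2, and 2 ^ 1 = 3
      | // selects the adjacent dword within the same half.)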
10952 | |
10953 | // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA |
10954 | // and BToA inputs. If there is also such a problem with the BToB and AToB |
10955 | // inputs, we don't try to fix it necessarily -- we'll recurse and see it in |
10956 | // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it |
10957 | // is essential that we don't *create* a 3<-1 as then we might oscillate. |
10958 | if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { |
10959 | // Compute how many inputs will be flipped by swapping these DWords. We need
10960 | // to balance this to ensure we don't form a 3-1 shuffle in the other
10961 | // half.
10963 | int NumFlippedAToBInputs = |
10964 | std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + |
10965 | std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); |
10966 | int NumFlippedBToBInputs = |
10967 | std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + |
10968 | std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); |
10969 | if ((NumFlippedAToBInputs == 1 && |
10970 | (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || |
10971 | (NumFlippedBToBInputs == 1 && |
10972 | (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { |
10973 | // We choose whether to fix the A half or B half based on whether that |
10974 | // half has zero flipped inputs. At zero, we may not be able to fix it |
10975 | // with that half. We also bias towards fixing the B half because that |
10976 | // will more commonly be the high half, and we have to bias one way. |
10977 | auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, |
10978 | ArrayRef<int> Inputs) { |
10979 | int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. |
10980 | bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); |
10981 | // Determine whether the free index is in the flipped dword or the |
10982 | // unflipped dword based on where the pinned index is. We use this bit |
10983 | // in an xor to conditionally select the adjacent dword. |
10984 | int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); |
10985 | bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); |
10986 | if (IsFixIdxInput == IsFixFreeIdxInput) |
10987 | FixFreeIdx += 1; |
10988 | IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); |
10989 | assert(IsFixIdxInput != IsFixFreeIdxInput &&
10990 |        "We need to be changing the number of flipped inputs!");
10991 | int PSHUFHalfMask[] = {0, 1, 2, 3}; |
10992 | std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); |
10993 | V = DAG.getNode( |
10994 | FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, |
10995 | MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, |
10996 | getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); |
10997 | |
10998 | for (int &M : Mask) |
10999 | if (M >= 0 && M == FixIdx) |
11000 | M = FixFreeIdx; |
11001 | else if (M >= 0 && M == FixFreeIdx) |
11002 | M = FixIdx; |
11003 | }; |
11004 | if (NumFlippedBToBInputs != 0) { |
11005 | int BPinnedIdx = |
11006 | BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; |
11007 | FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); |
11008 | } else { |
11009 | assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11010 | int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; |
11011 | FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); |
11012 | } |
11013 | } |
11014 | } |
11015 | |
11016 | int PSHUFDMask[] = {0, 1, 2, 3}; |
11017 | PSHUFDMask[ADWord] = BDWord; |
11018 | PSHUFDMask[BDWord] = ADWord; |
11019 | V = DAG.getBitcast( |
11020 | VT, |
11021 | DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), |
11022 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); |
11023 | |
11024 | // Adjust the mask to match the new locations of A and B. |
11025 | for (int &M : Mask) |
11026 | if (M >= 0 && M/2 == ADWord) |
11027 | M = 2 * BDWord + M % 2; |
11028 | else if (M >= 0 && M/2 == BDWord) |
11029 | M = 2 * ADWord + M % 2; |
11030 | |
11031 | // Recurse back into this routine to re-compute state now that this isn't |
11032 | // a 3 and 1 problem. |
11033 | return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, |
11034 | DAG); |
11035 | }; |
11036 | if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) |
11037 | return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); |
11038 | if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) |
11039 | return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); |
11040 | |
11041 | // At this point there are at most two inputs to the low and high halves from |
11042 | // each half. That means the inputs can always be grouped into dwords and |
11043 | // those dwords can then be moved to the correct half with a dword shuffle. |
11044 | // We use at most one low and one high word shuffle to collect these paired |
11045 | // inputs into dwords, and finally a dword shuffle to place them. |
11046 | int PSHUFLMask[4] = {-1, -1, -1, -1}; |
11047 | int PSHUFHMask[4] = {-1, -1, -1, -1}; |
11048 | int PSHUFDMask[4] = {-1, -1, -1, -1}; |
11049 | |
11050 | // First fix the masks for all the inputs that are staying in their |
11051 | // original halves. This will then dictate the targets of the cross-half |
11052 | // shuffles. |
11053 | auto fixInPlaceInputs = |
11054 | [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, |
11055 | MutableArrayRef<int> SourceHalfMask, |
11056 | MutableArrayRef<int> HalfMask, int HalfOffset) { |
11057 | if (InPlaceInputs.empty()) |
11058 | return; |
11059 | if (InPlaceInputs.size() == 1) { |
11060 | SourceHalfMask[InPlaceInputs[0] - HalfOffset] = |
11061 | InPlaceInputs[0] - HalfOffset; |
11062 | PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; |
11063 | return; |
11064 | } |
11065 | if (IncomingInputs.empty()) { |
11066 | // Just fix all of the in place inputs. |
11067 | for (int Input : InPlaceInputs) { |
11068 | SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; |
11069 | PSHUFDMask[Input / 2] = Input / 2; |
11070 | } |
11071 | return; |
11072 | } |
11073 | |
11074 | assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11075 | SourceHalfMask[InPlaceInputs[0] - HalfOffset] = |
11076 | InPlaceInputs[0] - HalfOffset; |
11077 | // Put the second input next to the first so that they are packed into |
11078 | // a dword. We find the adjacent index by toggling the low bit. |
11079 | int AdjIndex = InPlaceInputs[0] ^ 1; |
11080 | SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; |
11081 | std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); |
11082 | PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; |
11083 | }; |
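      | // Worked example (illustrative): with InPlaceInputs = {1, 2} in the low
      | // half (HalfOffset = 0), word 1 stays in slot 1, AdjIndex = 1 ^ 1 = 0, so
      | // word 2 is pulled into slot 0; HalfMask references to 2 are rewritten to
      | // 0, and dword 0 (now holding both inputs) is pinned via PSHUFDMask[0] = 0.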
11084 | fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); |
11085 | fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); |
11086 | |
11087 | // Now gather the cross-half inputs and place them into a free dword of |
11088 | // their target half. |
11089 | // FIXME: This operation could almost certainly be simplified dramatically to |
11090 | // look more like the 3-1 fixing operation. |
11091 | auto moveInputsToRightHalf = [&PSHUFDMask]( |
11092 | MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, |
11093 | MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, |
11094 | MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, |
11095 | int DestOffset) { |
11096 | auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { |
11097 | return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; |
11098 | }; |
11099 | auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, |
11100 | int Word) { |
11101 | int LowWord = Word & ~1; |
11102 | int HighWord = Word | 1; |
11103 | return isWordClobbered(SourceHalfMask, LowWord) || |
11104 | isWordClobbered(SourceHalfMask, HighWord); |
11105 | }; |
11106 | |
11107 | if (IncomingInputs.empty()) |
11108 | return; |
11109 | |
11110 | if (ExistingInputs.empty()) { |
11111 | // Map any dwords with inputs from them into the right half. |
11112 | for (int Input : IncomingInputs) { |
11113 | // If the source half mask maps over the inputs, turn those into |
11114 | // swaps and use the swapped lane. |
11115 | if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { |
11116 | if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { |
11117 | SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = |
11118 | Input - SourceOffset; |
11119 | // We have to swap the uses in our half mask in one sweep. |
11120 | for (int &M : HalfMask) |
11121 | if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) |
11122 | M = Input; |
11123 | else if (M == Input) |
11124 | M = SourceHalfMask[Input - SourceOffset] + SourceOffset; |
11125 | } else { |
11126 | assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11127 |        Input - SourceOffset &&
11128 |        "Previous placement doesn't match!");
11129 | } |
11130 | // Note that this correctly re-maps both when we do a swap and when |
11131 | // we observe the other side of the swap above. We rely on that to |
11132 | // avoid swapping the members of the input list directly. |
11133 | Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; |
11134 | } |
11135 | |
11136 | // Map the input's dword into the correct half. |
11137 | if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) |
11138 | PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; |
11139 | else |
11140 | assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11141 |        Input / 2 &&
11142 |        "Previous placement doesn't match!");
11143 | } |
11144 | |
11145 | // And just directly shift any other-half mask elements to be same-half |
11146 | // as we will have mirrored the dword containing the element into the |
11147 | // same position within that half. |
11148 | for (int &M : HalfMask) |
11149 | if (M >= SourceOffset && M < SourceOffset + 4) { |
11150 | M = M - SourceOffset + DestOffset; |
11151 | assert(M >= 0 && "This should never wrap below zero!");
11152 | } |
11153 | return; |
11154 | } |
11155 | |
11156 | // Ensure we have the input in a viable dword of its current half. This |
11157 | // is particularly tricky because the original position may be clobbered |
11158 | // by inputs being moved and *staying* in that half. |
11159 | if (IncomingInputs.size() == 1) { |
11160 | if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { |
11161 | int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + |
11162 | SourceOffset; |
11163 | SourceHalfMask[InputFixed - SourceOffset] = |
11164 | IncomingInputs[0] - SourceOffset; |
11165 | std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], |
11166 | InputFixed); |
11167 | IncomingInputs[0] = InputFixed; |
11168 | } |
11169 | } else if (IncomingInputs.size() == 2) { |
11170 | if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || |
11171 | isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { |
11172 | // We have two non-adjacent or clobbered inputs we need to extract from |
11173 | // the source half. To do this, we need to map them into some adjacent |
11174 | // dword slot in the source mask. |
11175 | int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, |
11176 | IncomingInputs[1] - SourceOffset}; |
11177 | |
11178 | // If there is a free slot in the source half mask adjacent to one of |
11179 | // the inputs, place the other input in it. We use (Index XOR 1) to |
11180 | // compute an adjacent index. |
11181 | if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && |
11182 | SourceHalfMask[InputsFixed[0] ^ 1] < 0) { |
11183 | SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; |
11184 | SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; |
11185 | InputsFixed[1] = InputsFixed[0] ^ 1; |
11186 | } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && |
11187 | SourceHalfMask[InputsFixed[1] ^ 1] < 0) { |
11188 | SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; |
11189 | SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; |
11190 | InputsFixed[0] = InputsFixed[1] ^ 1; |
11191 | } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && |
11192 | SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { |
11193 | // The two inputs are in the same DWord but it is clobbered and the |
11194 | // adjacent DWord isn't used at all. Move both inputs to the free |
11195 | // slot. |
11196 | SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; |
11197 | SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; |
11198 | InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); |
11199 | InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; |
11200 | } else { |
11201 | // The only way we hit this point is if there is no clobbering |
11202 | // (because there are no off-half inputs to this half) and there is no |
11203 | // free slot adjacent to one of the inputs. In this case, we have to |
11204 | // swap an input with a non-input. |
11205 | for (int i = 0; i < 4; ++i) |
11206 | assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11207 |        "We can't handle any clobbers here!");
11208 | assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11209 |        "Cannot have adjacent inputs here!");
11210 | |
11211 | SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; |
11212 | SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; |
11213 | |
11214 | // We also have to update the final source mask in this case because |
11215 | // it may need to undo the above swap. |
11216 | for (int &M : FinalSourceHalfMask) |
11217 | if (M == (InputsFixed[0] ^ 1) + SourceOffset) |
11218 | M = InputsFixed[1] + SourceOffset; |
11219 | else if (M == InputsFixed[1] + SourceOffset) |
11220 | M = (InputsFixed[0] ^ 1) + SourceOffset; |
11221 | |
11222 | InputsFixed[1] = InputsFixed[0] ^ 1; |
11223 | } |
11224 | |
11225 | // Point everything at the fixed inputs. |
11226 | for (int &M : HalfMask) |
11227 | if (M == IncomingInputs[0]) |
11228 | M = InputsFixed[0] + SourceOffset; |
11229 | else if (M == IncomingInputs[1]) |
11230 | M = InputsFixed[1] + SourceOffset; |
11231 | |
11232 | IncomingInputs[0] = InputsFixed[0] + SourceOffset; |
11233 | IncomingInputs[1] = InputsFixed[1] + SourceOffset; |
11234 | } |
11235 | } else { |
11236 | llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 11236); |
11237 | } |
11238 | |
11239 | // Now hoist the DWord down to the right half. |
11240 | int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; |
11241 | assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11242 | PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; |
11243 | for (int &M : HalfMask) |
11244 | for (int Input : IncomingInputs) |
11245 | if (M == Input) |
11246 | M = FreeDWord * 2 + Input % 2; |
11247 | }; |
11248 | moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, |
11249 | /*SourceOffset*/ 4, /*DestOffset*/ 0); |
11250 | moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, |
11251 | /*SourceOffset*/ 0, /*DestOffset*/ 4); |
11252 | |
11253 | // Now enact all the shuffles we've computed to move the inputs into their |
11254 | // target half. |
11255 | if (!isNoopShuffleMask(PSHUFLMask)) |
11256 | V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, |
11257 | getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); |
11258 | if (!isNoopShuffleMask(PSHUFHMask)) |
11259 | V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, |
11260 | getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); |
11261 | if (!isNoopShuffleMask(PSHUFDMask)) |
11262 | V = DAG.getBitcast( |
11263 | VT, |
11264 | DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), |
11265 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); |
11266 | |
11267 | // At this point, each half should contain all its inputs, and we can then |
11268 | // just shuffle them into their final position. |
11269 | assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11270 |        "Failed to lift all the high half inputs to the low mask!");
11271 | assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11272 |        "Failed to lift all the low half inputs to the high mask!");
11273 | |
11274 | // Do a half shuffle for the low mask. |
11275 | if (!isNoopShuffleMask(LoMask)) |
11276 | V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, |
11277 | getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); |
11278 | |
11279 | // Do a half shuffle with the high mask after shifting its values down. |
11280 | for (int &M : HiMask) |
11281 | if (M >= 0) |
11282 | M -= 4; |
11283 | if (!isNoopShuffleMask(HiMask)) |
11284 | V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, |
11285 | getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); |
11286 | |
11287 | return V; |
11288 | } |
11289 | |
11290 | /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the |
11291 | /// blend if only one input is used. |
11292 | static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( |
11293 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
11294 | const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, |
11295 | bool &V2InUse) { |
11296 | SDValue V1Mask[16]; |
11297 | SDValue V2Mask[16]; |
11298 | V1InUse = false; |
11299 | V2InUse = false; |
11300 | |
11301 | int Size = Mask.size(); |
11302 | int Scale = 16 / Size; |
11303 | for (int i = 0; i < 16; ++i) { |
11304 | if (Mask[i / Scale] < 0) { |
11305 | V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); |
11306 | } else { |
11307 | const int ZeroMask = 0x80; |
11308 | int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale |
11309 | : ZeroMask; |
11310 | int V2Idx = Mask[i / Scale] < Size |
11311 | ? ZeroMask |
11312 | : (Mask[i / Scale] - Size) * Scale + i % Scale; |
11313 | if (Zeroable[i / Scale]) |
11314 | V1Idx = V2Idx = ZeroMask; |
11315 | V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); |
11316 | V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); |
11317 | V1InUse |= (ZeroMask != V1Idx); |
11318 | V2InUse |= (ZeroMask != V2Idx); |
11319 | } |
11320 | } |
11321 | |
11322 | if (V1InUse) |
11323 | V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, |
11324 | DAG.getBitcast(MVT::v16i8, V1), |
11325 | DAG.getBuildVector(MVT::v16i8, DL, V1Mask)); |
11326 | if (V2InUse) |
11327 | V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, |
11328 | DAG.getBitcast(MVT::v16i8, V2), |
11329 | DAG.getBuildVector(MVT::v16i8, DL, V2Mask)); |
11330 | |
11331 | // If we need shuffled inputs from both, blend the two. |
11332 | SDValue V; |
11333 | if (V1InUse && V2InUse) |
11334 | V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); |
11335 | else |
11336 | V = V1InUse ? V1 : V2; |
11337 | |
11338 | // Cast the result back to the correct type. |
11339 | return DAG.getBitcast(VT, V); |
11340 | } |
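      |
      | // Sketch of the mask construction above for a v8i16 shuffle (Scale = 2):
      | // if Mask[0] = 9 (element 1 of V2), bytes 0 and 1 get V1Mask = 0x80 (a
      | // zeroing index) and V2Mask = 2 and 3, so PSHUFB zeroes that lane in V1's
      | // copy and pulls bytes 2-3 of V2 into it; the final OR merges the two.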
11341 | |
11342 | /// \brief Generic lowering of 8-lane i16 shuffles. |
11343 | /// |
11344 | /// This handles both single-input shuffles and combined shuffle/blends with |
11345 | /// two inputs. The single input shuffles are immediately delegated to |
11346 | /// a dedicated lowering routine. |
11347 | /// |
11348 | /// The blends are lowered in one of three fundamental ways. If there are few |
11349 | /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle |
11350 | /// of the input is significantly cheaper when lowered as an interleaving of |
11351 | /// the two inputs, try to interleave them. Otherwise, blend the low and high |
11352 | /// halves of the inputs separately (making them have relatively few inputs) |
11353 | /// and then concatenate them. |
11354 | static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
11355 | const APInt &Zeroable, |
11356 | SDValue V1, SDValue V2, |
11357 | const X86Subtarget &Subtarget, |
11358 | SelectionDAG &DAG) { |
11359 | assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11360 | assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11361 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11362 | |
11363 | // Whenever we can lower this as a zext, that instruction is strictly faster |
11364 | // than any alternative. |
11365 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
11366 | DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
11367 | return ZExt; |
11368 | |
11369 | int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); |
11370 | |
11371 | if (NumV2Inputs == 0) { |
11372 | // Check for being able to broadcast a single element. |
11373 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
11374 | DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) |
11375 | return Broadcast; |
11376 | |
11377 | // Try to use shift instructions. |
11378 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, |
11379 | Zeroable, Subtarget, DAG)) |
11380 | return Shift; |
11381 | |
11382 | // Use dedicated unpack instructions for masks that match their pattern. |
11383 | if (SDValue V = |
11384 | lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) |
11385 | return V; |
11386 | |
11387 | // Try to use byte rotation instructions. |
11388 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, |
11389 | Mask, Subtarget, DAG)) |
11390 | return Rotate; |
11391 | |
11392 | // Make a copy of the mask so it can be modified. |
11393 | SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end()); |
11394 | return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, |
11395 | MutableMask, Subtarget, |
11396 | DAG); |
11397 | } |
11398 | |
11399 | assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11400 |        "All single-input shuffles should be canonicalized to be V1-input "
11401 |        "shuffles.");
11402 | |
11403 | // Try to use shift instructions. |
11404 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, |
11405 | Zeroable, Subtarget, DAG)) |
11406 | return Shift; |
11407 | |
11408 | // See if we can use SSE4A Extraction / Insertion. |
11409 | if (Subtarget.hasSSE4A()) |
11410 | if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, |
11411 | Zeroable, DAG)) |
11412 | return V; |
11413 | |
11414 | // There are special ways we can lower some single-element blends. |
11415 | if (NumV2Inputs == 1) |
11416 | if (SDValue V = lowerVectorShuffleAsElementInsertion( |
11417 | DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
11418 | return V; |
11419 | |
11420 | // We have different paths for blend lowering, but they all must use the |
11421 | // *exact* same predicate. |
11422 | bool IsBlendSupported = Subtarget.hasSSE41(); |
11423 | if (IsBlendSupported) |
11424 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, |
11425 | Zeroable, Subtarget, DAG)) |
11426 | return Blend; |
11427 | |
11428 | if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, |
11429 | Zeroable, DAG)) |
11430 | return Masked; |
11431 | |
11432 | // Use dedicated unpack instructions for masks that match their pattern. |
11433 | if (SDValue V = |
11434 | lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) |
11435 | return V; |
11436 | |
11437 | // Try to use byte rotation instructions. |
11438 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
11439 | DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) |
11440 | return Rotate; |
11441 | |
11442 | if (SDValue BitBlend = |
11443 | lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) |
11444 | return BitBlend; |
11445 | |
11446 | // Try to lower by permuting the inputs into an unpack instruction. |
11447 | if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, |
11448 | V2, Mask, DAG)) |
11449 | return Unpack; |
11450 | |
11451 | // If we can't directly blend but can use PSHUFB, that will be better as it |
11452 | // can both shuffle and set up the inefficient blend. |
11453 | if (!IsBlendSupported && Subtarget.hasSSSE3()) { |
11454 | bool V1InUse, V2InUse; |
11455 | return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, |
11456 | Zeroable, DAG, V1InUse, V2InUse); |
11457 | } |
11458 | |
11459 | // We can always bit-blend if we have to so the fallback strategy is to |
11460 | // decompose into single-input permutes and blends. |
11461 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, |
11462 | Mask, DAG); |
11463 | } |
11464 | |
11465 | /// \brief Check whether a compaction lowering can be done by dropping even |
11466 | /// elements and compute how many times even elements must be dropped. |
11467 | /// |
11468 | /// This handles shuffles which take every Nth element where N is a power of |
11469 | /// two. Example shuffle masks: |
11470 | /// |
11471 | /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 |
11472 | /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
11473 | /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 |
11474 | /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 |
11475 | /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 |
11476 | /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 |
11477 | /// |
11478 | /// Any of these lanes can of course be undef. |
11479 | /// |
11480 | /// This routine only supports N <= 3. |
11481 | /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here |
11482 | /// for larger N. |
11483 | /// |
11484 | /// \returns N above, or the number of times even elements must be dropped if |
11485 | /// there is such a number. Otherwise returns zero. |
11486 | static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, |
11487 | bool IsSingleInput) { |
11488 | // The modulus for the shuffle vector entries is based on whether this is |
11489 | // a single input or not. |
11490 | int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); |
11491 | assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11492 |        "We should only be called with masks with a power-of-2 size!");
11493 | |
11494 | uint64_t ModMask = (uint64_t)ShuffleModulus - 1; |
11495 | |
11496 | // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, |
11497 | // and 2^3 simultaneously. This is because we may have ambiguity with |
11498 | // partially undef inputs. |
11499 | bool ViableForN[3] = {true, true, true}; |
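      | // (For example, a mask beginning <0, undef, undef, ...> remains viable for
      | // every N until a defined element disambiguates the stride.)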
11500 | |
11501 | for (int i = 0, e = Mask.size(); i < e; ++i) { |
11502 | // Ignore undef lanes, we'll optimistically collapse them to the pattern we |
11503 | // want. |
11504 | if (Mask[i] < 0) |
11505 | continue; |
11506 | |
11507 | bool IsAnyViable = false; |
11508 | for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) |
11509 | if (ViableForN[j]) { |
11510 | uint64_t N = j + 1; |
11511 | |
11512 | // The shuffle mask must be equal to (i * 2^N) % M. |
11513 | if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) |
11514 | IsAnyViable = true; |
11515 | else |
11516 | ViableForN[j] = false; |
11517 | } |
11518 | // Early exit if we exhaust the possible powers of two. |
11519 | if (!IsAnyViable) |
11520 | break; |
11521 | } |
11522 | |
11523 | for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) |
11524 | if (ViableForN[j]) |
11525 | return j + 1; |
11526 | |
11527 | // Return 0 as there is no viable power of two. |
11528 | return 0; |
11529 | } |
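      |
      | // Worked check (illustrative): for the single-input mask
      | // <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14>, ShuffleModulus is 16 and every
      | // element satisfies Mask[i] == ((i << 1) & 15), so the loop leaves only
      | // ViableForN[0] set and the routine returns N = 1.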
11530 | |
11531 | /// \brief Generic lowering of v16i8 shuffles. |
11532 | /// |
11533 | /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to |
11534 | /// detect any complexity-reducing interleaving. If that doesn't help, it uses
11535 | /// UNPCK to spread the i8 elements across two i16-element vectors, and uses |
11536 | /// the existing lowering for v8i16 blends on each half, finally PACK-ing them |
11537 | /// back together. |
11538 | static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
11539 | const APInt &Zeroable, |
11540 | SDValue V1, SDValue V2, |
11541 | const X86Subtarget &Subtarget, |
11542 | SelectionDAG &DAG) { |
11543 | assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11544 | assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11545 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11546 | |
11547 | // Try to use shift instructions. |
11548 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, |
11549 | Zeroable, Subtarget, DAG)) |
11550 | return Shift; |
11551 | |
11552 | // Try to use byte rotation instructions. |
11553 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
11554 | DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) |
11555 | return Rotate; |
11556 | |
11557 | // Try to use a zext lowering. |
11558 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
11559 | DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
11560 | return ZExt; |
11561 | |
11562 | // See if we can use SSE4A Extraction / Insertion. |
11563 | if (Subtarget.hasSSE4A()) |
11564 | if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, |
11565 | Zeroable, DAG)) |
11566 | return V; |
11567 | |
11568 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); |
11569 | |
11570 | // For single-input shuffles, there are some nicer lowering tricks we can use. |
11571 | if (NumV2Elements == 0) { |
11572 | // Check for being able to broadcast a single element. |
11573 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
11574 | DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) |
11575 | return Broadcast; |
11576 | |
11577 | // Check whether we can widen this to an i16 shuffle by duplicating bytes. |
11578 | // Notably, this handles splat and partial-splat shuffles more efficiently. |
11579 | // However, it only makes sense if the pre-duplication shuffle simplifies |
11580 | // things significantly. Currently, this means we need to be able to |
11581 | // express the pre-duplication shuffle as an i16 shuffle. |
11582 | // |
11583 | // FIXME: We should check for other patterns which can be widened into an |
11584 | // i16 shuffle as well. |
11585 | auto canWidenViaDuplication = [](ArrayRef<int> Mask) { |
11586 | for (int i = 0; i < 16; i += 2) |
11587 | if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) |
11588 | return false; |
11589 | |
11590 | return true; |
11591 | }; |
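// Editorial example: a mask beginning {3, 3, 7, 7, -1, 5, 0, 0, ...} passes
// this check (every byte pair is equal or partially undef), while one
// beginning {0, 1, ...} fails on its first pair.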
11592 | auto tryToWidenViaDuplication = [&]() -> SDValue { |
11593 | if (!canWidenViaDuplication(Mask)) |
11594 | return SDValue(); |
11595 | SmallVector<int, 4> LoInputs; |
11596 | copy_if(Mask, std::back_inserter(LoInputs), |
11597 | [](int M) { return M >= 0 && M < 8; }); |
11598 | std::sort(LoInputs.begin(), LoInputs.end()); |
11599 | LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), |
11600 | LoInputs.end()); |
11601 | SmallVector<int, 4> HiInputs; |
11602 | copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); |
11603 | std::sort(HiInputs.begin(), HiInputs.end()); |
11604 | HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), |
11605 | HiInputs.end()); |
11606 | |
11607 | bool TargetLo = LoInputs.size() >= HiInputs.size(); |
11608 | ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; |
11609 | ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; |
11610 | |
11611 | int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; |
11612 | SmallDenseMap<int, int, 8> LaneMap; |
11613 | for (int I : InPlaceInputs) { |
11614 | PreDupI16Shuffle[I / 2] = I / 2;
11615 | LaneMap[I] = I; |
11616 | } |
11617 | int j = TargetLo ? 0 : 4, je = j + 4; |
11618 | for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { |
11619 | // Check if j is already a shuffle of this input. This happens when |
11620 | // there are two adjacent bytes after we move the low one. |
11621 | if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { |
11622 | // If we haven't yet mapped the input, search for a slot into which |
11623 | // we can map it. |
11624 | while (j < je && PreDupI16Shuffle[j] >= 0) |
11625 | ++j; |
11626 | |
11627 | if (j == je) |
11628 | // We can't place the inputs into a single half with a simple i16 shuffle, so bail. |
11629 | return SDValue(); |
11630 | |
11631 | // Map this input with the i16 shuffle. |
11632 | PreDupI16Shuffle[j] = MovingInputs[i] / 2; |
11633 | } |
11634 | |
11635 | // Update the lane map based on the mapping we ended up with. |
11636 | LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; |
11637 | } |
11638 | V1 = DAG.getBitcast( |
11639 | MVT::v16i8, |
11640 | DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), |
11641 | DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); |
11642 | |
11643 | // Unpack the bytes to form the i16s that will be shuffled into place. |
11644 | V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, |
11645 | MVT::v16i8, V1, V1); |
11646 | |
11647 | int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; |
11648 | for (int i = 0; i < 16; ++i) |
11649 | if (Mask[i] >= 0) { |
11650 | int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); |
11651 | assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11652 | if (PostDupI16Shuffle[i / 2] < 0) |
11653 | PostDupI16Shuffle[i / 2] = MappedMask; |
11654 | else |
11655 | assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11656 | "Conflicting entries in the original shuffle!");
11657 | } |
11658 | return DAG.getBitcast( |
11659 | MVT::v16i8, |
11660 | DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), |
11661 | DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); |
11662 | }; |
11663 | if (SDValue V = tryToWidenViaDuplication()) |
11664 | return V; |
11665 | } |
11666 | |
11667 | if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, |
11668 | Zeroable, DAG)) |
11669 | return Masked; |
11670 | |
11671 | // Use dedicated unpack instructions for masks that match their pattern. |
11672 | if (SDValue V = |
11673 | lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) |
11674 | return V; |
11675 | |
11676 | // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly |
11677 | // with PSHUFB. It is important to do this before we attempt to generate any |
11678 | // blends but after all of the single-input lowerings. If the single input |
11679 | // lowerings can find an instruction sequence that is faster than a PSHUFB, we |
11680 | // want to preserve that and we can DAG combine any longer sequences into |
11681 | // a PSHUFB in the end. But once we start blending from multiple inputs, |
11682 | // the complexity of DAG combining bad patterns back into PSHUFB is too high, |
11683 | // and there are *very* few patterns that would actually be faster than the |
11684 | // PSHUFB approach because of its ability to zero lanes. |
11685 | // |
11686 | // FIXME: The only exceptions to the above are blends which are exact |
11687 | // interleavings with direct instructions supporting them. We currently don't |
11688 | // handle those well here. |
11689 | if (Subtarget.hasSSSE3()) { |
11690 | bool V1InUse = false; |
11691 | bool V2InUse = false; |
11692 | |
11693 | SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( |
11694 | DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); |
11695 | |
11696 | // If both V1 and V2 are in use and we can use a direct blend or an unpack, |
11697 | // do so. This avoids using them to handle blends-with-zero, which is
11698 | // important because a single pshufb is significantly faster for that case.
11699 | if (V1InUse && V2InUse) { |
11700 | if (Subtarget.hasSSE41()) |
11701 | if (SDValue Blend = lowerVectorShuffleAsBlend( |
11702 | DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
11703 | return Blend; |
11704 | |
11705 | // We can use an unpack to do the blending rather than an or in some |
11706 | // cases. Even though the or may be (very marginally) more efficient, we
11707 | // prefer this lowering because there are common cases where part of
11708 | // the complexity of the shuffles goes away when we do the final blend as |
11709 | // an unpack. |
11710 | // FIXME: It might be worth trying to detect if the unpack-feeding |
11711 | // shuffles will both be pshufb, in which case we shouldn't bother with |
11712 | // this. |
11713 | if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( |
11714 | DL, MVT::v16i8, V1, V2, Mask, DAG)) |
11715 | return Unpack; |
11716 | } |
11717 | |
11718 | return PSHUFB; |
11719 | } |
11720 | |
11721 | // There are special ways we can lower some single-element blends. |
11722 | if (NumV2Elements == 1) |
11723 | if (SDValue V = lowerVectorShuffleAsElementInsertion( |
11724 | DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
11725 | return V; |
11726 | |
11727 | if (SDValue BitBlend = |
11728 | lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) |
11729 | return BitBlend; |
11730 | |
11731 | // Check whether a compaction lowering can be done. This handles shuffles |
11732 | // which take every Nth element for some even N. See the helper function for |
11733 | // details. |
11734 | // |
11735 | // We special case these as they can be particularly efficiently handled with |
11736 | // the PACKUSWB instruction on x86, and they show up in common patterns of
11737 | // rearranging bytes to truncate wide elements. |
11738 | bool IsSingleInput = V2.isUndef(); |
11739 | if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { |
11740 | // NumEvenDrops is the log2 of the stride between the elements we keep.
11741 | // Another way of thinking about it is that we need to drop the even
11742 | // elements this many times to get the original input.
11743 | |
11744 | // First we need to zero all the dropped bytes. |
11745 | assert(NumEvenDrops <= 3 &&
11746 | "No support for dropping even elements more than 3 times.");
11747 | // We use the mask type to pick which bytes are preserved based on how many |
11748 | // elements are dropped. |
11749 | MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; |
11750 | SDValue ByteClearMask = DAG.getBitcast( |
11751 | MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); |
11752 | V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); |
11753 | if (!IsSingleInput) |
11754 | V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); |
11755 | |
11756 | // Now pack things back together. |
11757 | V1 = DAG.getBitcast(MVT::v8i16, V1); |
11758 | V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); |
11759 | SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); |
11760 | for (int i = 1; i < NumEvenDrops; ++i) { |
11761 | Result = DAG.getBitcast(MVT::v8i16, Result); |
11762 | Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); |
11763 | } |
11764 | |
11765 | return Result; |
11766 | } |
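// Editorial worked example: for the single-input N = 2 mask {0, 4, 8, 12,
// 0, 4, 8, 12, ...}, the v4i32 0xFF constant keeps only byte 0 of each i32
// lane; the first PACKUS compacts those bytes to a stride of 2, and the
// second pass (i = 1) compacts again, leaving every 4th original byte
// repeated across the result.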
11767 | |
11768 | // Handle multi-input cases by blending single-input shuffles. |
11769 | if (NumV2Elements > 0) |
11770 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, |
11771 | Mask, DAG); |
11772 | |
11773 | // The fallback path for single-input shuffles widens this into two v8i16 |
11774 | // vectors with unpacks, shuffles those, and then pulls them back together |
11775 | // with a pack. |
11776 | SDValue V = V1; |
11777 | |
11778 | std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; |
11779 | std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; |
11780 | for (int i = 0; i < 16; ++i) |
11781 | if (Mask[i] >= 0) |
11782 | (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; |
11783 | |
11784 | SDValue VLoHalf, VHiHalf; |
11785 | // Check if any of the odd lanes in the v16i8 are used. If not, we can mask |
11786 | // them out and avoid using UNPCK{L,H} to extract the elements of V as |
11787 | // i16s. |
11788 | if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && |
11789 | none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { |
11790 | // Use a mask to drop the high bytes. |
11791 | VLoHalf = DAG.getBitcast(MVT::v8i16, V); |
11792 | VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, |
11793 | DAG.getConstant(0x00FF, DL, MVT::v8i16)); |
11794 | |
11795 | // This will be a single vector shuffle instead of a blend so nuke VHiHalf. |
11796 | VHiHalf = DAG.getUNDEF(MVT::v8i16); |
11797 | |
11798 | // Squash the masks to point directly into VLoHalf. |
11799 | for (int &M : LoBlendMask) |
11800 | if (M >= 0) |
11801 | M /= 2; |
11802 | for (int &M : HiBlendMask) |
11803 | if (M >= 0) |
11804 | M /= 2; |
11805 | } else { |
11806 | // Otherwise just unpack the low half of V into VLoHalf and the high half into |
11807 | // VHiHalf so that we can blend them as i16s. |
11808 | SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); |
11809 | |
11810 | VLoHalf = DAG.getBitcast( |
11811 | MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); |
11812 | VHiHalf = DAG.getBitcast( |
11813 | MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); |
11814 | } |
11815 | |
11816 | SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); |
11817 | SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); |
11818 | |
11819 | return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); |
11820 | } |
11821 | |
11822 | /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. |
11823 | /// |
11824 | /// This routine breaks down the specific type of 128-bit shuffle and |
11825 | /// dispatches to the lowering routines accordingly. |
11826 | static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
11827 | MVT VT, SDValue V1, SDValue V2, |
11828 | const APInt &Zeroable, |
11829 | const X86Subtarget &Subtarget, |
11830 | SelectionDAG &DAG) { |
11831 | switch (VT.SimpleTy) { |
11832 | case MVT::v2i64: |
11833 | return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11834 | case MVT::v2f64: |
11835 | return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11836 | case MVT::v4i32: |
11837 | return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11838 | case MVT::v4f32: |
11839 | return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11840 | case MVT::v8i16: |
11841 | return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11842 | case MVT::v16i8: |
11843 | return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
11844 | |
11845 | default: |
11846 | llvm_unreachable("Unimplemented!");
11847 | } |
11848 | } |
11849 | |
11850 | /// \brief Generic routine to split vector shuffle into half-sized shuffles. |
11851 | /// |
11852 | /// This routine just extracts two subvectors, shuffles them independently, and |
11853 | /// then concatenates them back together. This should work effectively with all |
11854 | /// AVX vector shuffle types. |
11855 | static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, |
11856 | SDValue V2, ArrayRef<int> Mask, |
11857 | SelectionDAG &DAG) { |
11858 | assert(VT.getSizeInBits() >= 256 &&
11859 | "Only for 256-bit or wider vector shuffles!");
11860 | assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11861 | assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11862 | |
11863 | ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); |
11864 | ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); |
11865 | |
11866 | int NumElements = VT.getVectorNumElements(); |
11867 | int SplitNumElements = NumElements / 2; |
11868 | MVT ScalarVT = VT.getVectorElementType(); |
11869 | MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); |
11870 | |
11871 | // Rather than splitting build-vectors, just build two narrower build |
11872 | // vectors. This helps shuffling with splats and zeros. |
11873 | auto SplitVector = [&](SDValue V) { |
11874 | V = peekThroughBitcasts(V); |
11875 | |
11876 | MVT OrigVT = V.getSimpleValueType(); |
11877 | int OrigNumElements = OrigVT.getVectorNumElements(); |
11878 | int OrigSplitNumElements = OrigNumElements / 2; |
11879 | MVT OrigScalarVT = OrigVT.getVectorElementType(); |
11880 | MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); |
11881 | |
11882 | SDValue LoV, HiV; |
11883 | |
11884 | auto *BV = dyn_cast<BuildVectorSDNode>(V); |
11885 | if (!BV) { |
11886 | LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, |
11887 | DAG.getIntPtrConstant(0, DL)); |
11888 | HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, |
11889 | DAG.getIntPtrConstant(OrigSplitNumElements, DL)); |
11890 | } else { |
11891 | |
11892 | SmallVector<SDValue, 16> LoOps, HiOps; |
11893 | for (int i = 0; i < OrigSplitNumElements; ++i) { |
11894 | LoOps.push_back(BV->getOperand(i)); |
11895 | HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); |
11896 | } |
11897 | LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); |
11898 | HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); |
11899 | } |
11900 | return std::make_pair(DAG.getBitcast(SplitVT, LoV), |
11901 | DAG.getBitcast(SplitVT, HiV)); |
11902 | }; |
11903 | |
11904 | SDValue LoV1, HiV1, LoV2, HiV2; |
11905 | std::tie(LoV1, HiV1) = SplitVector(V1); |
11906 | std::tie(LoV2, HiV2) = SplitVector(V2); |
11907 | |
11908 | // Now create two 4-way blends of these half-width vectors. |
11909 | auto HalfBlend = [&](ArrayRef<int> HalfMask) { |
11910 | bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; |
11911 | SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1); |
11912 | SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1); |
11913 | SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1); |
11914 | for (int i = 0; i < SplitNumElements; ++i) { |
11915 | int M = HalfMask[i]; |
11916 | if (M >= NumElements) { |
11917 | if (M >= NumElements + SplitNumElements) |
11918 | UseHiV2 = true; |
11919 | else |
11920 | UseLoV2 = true; |
11921 | V2BlendMask[i] = M - NumElements; |
11922 | BlendMask[i] = SplitNumElements + i; |
11923 | } else if (M >= 0) { |
11924 | if (M >= SplitNumElements) |
11925 | UseHiV1 = true; |
11926 | else |
11927 | UseLoV1 = true; |
11928 | V1BlendMask[i] = M; |
11929 | BlendMask[i] = i; |
11930 | } |
11931 | } |
11932 | |
11933 | // Because the lowering happens after all combining takes place, we need to |
11934 | // manually combine these blend masks as much as possible so that we create |
11935 | // a minimal number of high-level vector shuffle nodes. |
11936 | |
11937 | // First try just blending the halves of V1 or V2. |
11938 | if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) |
11939 | return DAG.getUNDEF(SplitVT); |
11940 | if (!UseLoV2 && !UseHiV2) |
11941 | return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); |
11942 | if (!UseLoV1 && !UseHiV1) |
11943 | return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); |
11944 | |
11945 | SDValue V1Blend, V2Blend; |
11946 | if (UseLoV1 && UseHiV1) { |
11947 | V1Blend = |
11948 | DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); |
11949 | } else { |
11950 | // We only use half of V1 so map the usage down into the final blend mask. |
11951 | V1Blend = UseLoV1 ? LoV1 : HiV1; |
11952 | for (int i = 0; i < SplitNumElements; ++i) |
11953 | if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) |
11954 | BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); |
11955 | } |
11956 | if (UseLoV2 && UseHiV2) { |
11957 | V2Blend = |
11958 | DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); |
11959 | } else { |
11960 | // We only use half of V2 so map the usage down into the final blend mask. |
11961 | V2Blend = UseLoV2 ? LoV2 : HiV2; |
11962 | for (int i = 0; i < SplitNumElements; ++i) |
11963 | if (BlendMask[i] >= SplitNumElements) |
11964 | BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); |
11965 | } |
11966 | return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); |
11967 | }; |
11968 | SDValue Lo = HalfBlend(LoMask); |
11969 | SDValue Hi = HalfBlend(HiMask); |
11970 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); |
11971 | } |
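// Editorial worked example: for a v8i32 shuffle with Mask = {0, 8, 1, 9, 4,
// 12, 5, 13}, each half mask touches only one half of each input, so both
// calls to HalfBlend collapse to a single v4i32 shuffle with mask
// {0, 4, 1, 5}: over (LoV1, LoV2) for the low result and over (HiV1, HiV2)
// for the high result.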
11972 | |
11973 | /// \brief Either split a vector in halves or decompose the shuffles and the |
11974 | /// blend. |
11975 | /// |
11976 | /// This is provided as a good fallback for many lowerings of non-single-input |
11977 | /// shuffles with more than one 128-bit lane. In those cases, we want to select |
11978 | /// between splitting the shuffle into 128-bit components and stitching those |
11979 | /// back together vs. extracting the single-input shuffles and blending those |
11980 | /// results. |
11981 | static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, |
11982 | SDValue V1, SDValue V2, |
11983 | ArrayRef<int> Mask, |
11984 | SelectionDAG &DAG) { |
11985 | assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11986 | "shuffles as it could then recurse on itself.");
11987 | int Size = Mask.size(); |
11988 | |
11989 | // If this can be modeled as a broadcast of two elements followed by a blend, |
11990 | // prefer that lowering. This is especially important because broadcasts can |
11991 | // often fold with memory operands. |
11992 | auto DoBothBroadcast = [&] { |
11993 | int V1BroadcastIdx = -1, V2BroadcastIdx = -1; |
11994 | for (int M : Mask) |
11995 | if (M >= Size) { |
11996 | if (V2BroadcastIdx < 0) |
11997 | V2BroadcastIdx = M - Size; |
11998 | else if (M - Size != V2BroadcastIdx) |
11999 | return false; |
12000 | } else if (M >= 0) { |
12001 | if (V1BroadcastIdx < 0) |
12002 | V1BroadcastIdx = M; |
12003 | else if (M != V1BroadcastIdx) |
12004 | return false; |
12005 | } |
12006 | return true; |
12007 | }; |
12008 | if (DoBothBroadcast()) |
12009 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, |
12010 | DAG); |
12011 | |
12012 | // If the inputs all stem from a single 128-bit lane of each input, then we |
12013 | // split them rather than blending because the split will decompose to |
12014 | // unusually few instructions. |
12015 | int LaneCount = VT.getSizeInBits() / 128; |
12016 | int LaneSize = Size / LaneCount; |
12017 | SmallBitVector LaneInputs[2]; |
12018 | LaneInputs[0].resize(LaneCount, false); |
12019 | LaneInputs[1].resize(LaneCount, false); |
12020 | for (int i = 0; i < Size; ++i) |
12021 | if (Mask[i] >= 0) |
12022 | LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; |
12023 | if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) |
12024 | return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); |
12025 | |
12026 | // Otherwise, just fall back to decomposed shuffles and a blend. This requires |
12027 | // that the decomposed single-input shuffles don't end up here. |
12028 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); |
12029 | } |
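// Editorial example: a v8f32 mask such as {3, 11, 3, 11, 3, 11, 3, 11}
// broadcasts element 3 of each input, so DoBothBroadcast() returns true and
// the shuffle is decomposed into two broadcasts plus a blend.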
12030 | |
12031 | /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as |
12032 | /// a permutation and blend of those lanes. |
12033 | /// |
12034 | /// This essentially blends the out-of-lane inputs to each lane into the lane |
12035 | /// from a permuted copy of the vector. This lowering strategy results in four |
12036 | /// instructions in the worst case for a single-input cross lane shuffle which |
12037 | /// is lower than any other fully general cross-lane shuffle strategy I'm aware |
12038 | /// of. Special cases for each particular shuffle pattern should be handled |
12039 | /// prior to trying this lowering. |
12040 | static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, |
12041 | SDValue V1, SDValue V2, |
12042 | ArrayRef<int> Mask, |
12043 | SelectionDAG &DAG) { |
12044 | // FIXME: This should probably be generalized for 512-bit vectors as well. |
12045 | assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12046 | int Size = Mask.size(); |
12047 | int LaneSize = Size / 2; |
12048 | |
12049 | // If there are only inputs from one 128-bit lane, splitting will in fact be |
12050 | // less expensive. The flags track whether the given lane contains an element |
12051 | // that crosses to another lane. |
12052 | bool LaneCrossing[2] = {false, false}; |
12053 | for (int i = 0; i < Size; ++i) |
12054 | if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) |
12055 | LaneCrossing[(Mask[i] % Size) / LaneSize] = true; |
12056 | if (!LaneCrossing[0] || !LaneCrossing[1]) |
12057 | return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); |
12058 | |
12059 | assert(V2.isUndef() &&
12060 | "This last part of this routine only works on single input shuffles");
12061 | |
12062 | SmallVector<int, 32> FlippedBlendMask(Size); |
12063 | for (int i = 0; i < Size; ++i) |
12064 | FlippedBlendMask[i] = |
12065 | Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) |
12066 | ? Mask[i] |
12067 | : Mask[i] % LaneSize + |
12068 | (i / LaneSize) * LaneSize + Size); |
12069 | |
12070 | // Flip the vector, and blend the results which should now be in-lane. The |
12071 | // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and |
12072 | // 5 for the high source. The value 3 selects the high half of source 2 and |
12073 | // the value 2 selects the low half of source 2. We only use source 2 to |
12074 | // allow folding it into a memory operand. |
12075 | unsigned PERMMask = 3 | 2 << 4; |
12076 | SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), |
12077 | V1, DAG.getConstant(PERMMask, DL, MVT::i8)); |
12078 | return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); |
12079 | } |
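// Editorial worked example: for a single-input v4f64 shuffle with mask
// {2, 1, 3, 0}, both lanes cross, so Flipped = <V1[2], V1[3], V1[0], V1[1]>
// via the 0x23 immediate, and the final blend uses the now in-lane mask
// {4, 1, 3, 6}.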
12080 | |
12081 | /// \brief Handle lowering 2-lane 128-bit shuffles. |
12082 | static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, |
12083 | SDValue V2, ArrayRef<int> Mask, |
12084 | const APInt &Zeroable, |
12085 | const X86Subtarget &Subtarget, |
12086 | SelectionDAG &DAG) { |
12087 | SmallVector<int, 4> WidenedMask; |
12088 | if (!canWidenShuffleElements(Mask, WidenedMask)) |
12089 | return SDValue(); |
12090 | |
12091 | // TODO: If minimizing size and one of the inputs is a zero vector and
12092 | // the zero vector has only one use, we could use a VPERM2X128 to save the
12093 | // instruction bytes needed to explicitly generate the zero vector. |
12094 | |
12095 | // Blends are faster and handle all the non-lane-crossing cases. |
12096 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, |
12097 | Zeroable, Subtarget, DAG)) |
12098 | return Blend; |
12099 | |
12100 | bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); |
12101 | bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); |
12102 | |
12103 | // If either input operand is a zero vector, use VPERM2X128 because its mask |
12104 | // allows us to replace the zero input with an implicit zero. |
12105 | if (!IsV1Zero && !IsV2Zero) { |
12106 | // Check for patterns which can be matched with a single insert of a 128-bit |
12107 | // subvector. |
12108 | bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); |
12109 | if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { |
12110 | // With AVX2, use VPERMQ/VPERMPD to allow memory folding. |
12111 | if (Subtarget.hasAVX2() && V2.isUndef()) |
12112 | return SDValue(); |
12113 | |
12114 | // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, |
12115 | // this will likely become vinsertf128 which can't fold a 256-bit memop. |
12116 | if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) { |
12117 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), |
12118 | VT.getVectorNumElements() / 2); |
12119 | SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, |
12120 | DAG.getIntPtrConstant(0, DL)); |
12121 | SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, |
12122 | OnlyUsesV1 ? V1 : V2, |
12123 | DAG.getIntPtrConstant(0, DL)); |
12124 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); |
12125 | } |
12126 | } |
12127 | } |
12128 | |
12129 | // Otherwise form a 128-bit permutation. After accounting for undefs, |
12130 | // convert the 64-bit shuffle mask selection values into 128-bit |
12131 | // selection bits by dividing the indexes by 2 and shifting into positions |
12132 | // defined by a vperm2*128 instruction's immediate control byte. |
12133 | |
12134 | // The immediate permute control byte looks like this: |
12135 | // [1:0] - select 128 bits from sources for low half of destination |
12136 | // [2] - ignore |
12137 | // [3] - zero low half of destination |
12138 | // [5:4] - select 128 bits from sources for high half of destination |
12139 | // [6] - ignore |
12140 | // [7] - zero high half of destination |
12141 | |
12142 | int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0]; |
12143 | int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1]; |
12144 | |
12145 | unsigned PermMask = MaskLO | (MaskHI << 4); |
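// Editorial example: WidenedMask {1, 2} selects the high 128 bits of V1 and
// the low 128 bits of V2, giving PermMask = 1 | (2 << 4) = 0x21.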
12146 | |
12147 | // If either input is a zero vector, replace it with an undef input. |
12148 | // Shuffle mask values < 4 are selecting elements of V1. |
12149 | // Shuffle mask values >= 4 are selecting elements of V2. |
12150 | // Adjust each half of the permute mask by clearing the half that was |
12151 | // selecting the zero vector and setting the zero mask bit. |
12152 | if (IsV1Zero) { |
12153 | V1 = DAG.getUNDEF(VT); |
12154 | if (MaskLO < 2) |
12155 | PermMask = (PermMask & 0xf0) | 0x08; |
12156 | if (MaskHI < 2) |
12157 | PermMask = (PermMask & 0x0f) | 0x80; |
12158 | } |
12159 | if (IsV2Zero) { |
12160 | V2 = DAG.getUNDEF(VT); |
12161 | if (MaskLO >= 2) |
12162 | PermMask = (PermMask & 0xf0) | 0x08; |
12163 | if (MaskHI >= 2) |
12164 | PermMask = (PermMask & 0x0f) | 0x80; |
12165 | } |
12166 | |
12167 | return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, |
12168 | DAG.getConstant(PermMask, DL, MVT::i8)); |
12169 | } |
12170 | |
12171 | /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then |
12172 | /// shuffling each lane. |
12173 | /// |
12174 | /// This will only succeed when fixing the 128-bit lanes results in a
12175 | /// single-input, non-lane-crossing shuffle with a repeating shuffle mask in
12176 | /// each 128-bit lane. This handles many cases where we can quickly blend away
12177 | /// the lane crosses early and then use simpler shuffles within each lane. |
12178 | /// |
12179 | /// FIXME: It might be worthwhile at some point to support this without |
12180 | /// requiring the 128-bit lane-relative shuffles to be repeating, but currently |
12181 | /// in x86 only floating point has interesting non-repeating shuffles, and even |
12182 | /// those are still *marginally* more expensive. |
12183 | static SDValue lowerVectorShuffleByMerging128BitLanes( |
12184 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
12185 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
12186 | assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12187 | |
12188 | int Size = Mask.size(); |
12189 | int LaneSize = 128 / VT.getScalarSizeInBits(); |
12190 | int NumLanes = Size / LaneSize; |
12191 | assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12192 | |
12193 | // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also |
12194 | // check whether the in-128-bit lane shuffles share a repeating pattern. |
12195 | SmallVector<int, 4> Lanes((unsigned)NumLanes, -1); |
12196 | SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1); |
12197 | for (int i = 0; i < Size; ++i) { |
12198 | if (Mask[i] < 0) |
12199 | continue; |
12200 | |
12201 | int j = i / LaneSize; |
12202 | |
12203 | if (Lanes[j] < 0) { |
12204 | // First entry we've seen for this lane. |
12205 | Lanes[j] = Mask[i] / LaneSize; |
12206 | } else if (Lanes[j] != Mask[i] / LaneSize) { |
12207 | // This doesn't match the lane selected previously! |
12208 | return SDValue(); |
12209 | } |
12210 | |
12211 | // Check that within each lane we have a consistent shuffle mask. |
12212 | int k = i % LaneSize; |
12213 | if (InLaneMask[k] < 0) { |
12214 | InLaneMask[k] = Mask[i] % LaneSize; |
12215 | } else if (InLaneMask[k] != Mask[i] % LaneSize) { |
12216 | // This doesn't fit a repeating in-lane mask. |
12217 | return SDValue(); |
12218 | } |
12219 | } |
12220 | |
12221 | // First shuffle the lanes into place. |
12222 | MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, |
12223 | VT.getSizeInBits() / 64); |
12224 | SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1); |
12225 | for (int i = 0; i < NumLanes; ++i) |
12226 | if (Lanes[i] >= 0) { |
12227 | LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
12228 | LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
12229 | } |
12230 | |
12231 | V1 = DAG.getBitcast(LaneVT, V1); |
12232 | V2 = DAG.getBitcast(LaneVT, V2); |
12233 | SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); |
12234 | |
12235 | // Cast it back to the type we actually want. |
12236 | LaneShuffle = DAG.getBitcast(VT, LaneShuffle); |
12237 | |
12238 | // Now do a simple shuffle that isn't lane crossing. |
12239 | SmallVector<int, 8> NewMask((unsigned)Size, -1); |
12240 | for (int i = 0; i < Size; ++i) |
12241 | if (Mask[i] >= 0) |
12242 | NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; |
12243 | assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12244 | "Must not introduce lane crosses at this point!");
12245 | |
12246 | return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); |
12247 | } |
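// Editorial worked example: the v8i32 mask {5, 4, 7, 6, 13, 12, 15, 14}
// first produces the v4i64 lane shuffle {2, 3, 6, 7} (the high lane of each
// input) and then the repeating, non-crossing mask {1, 0, 3, 2, 5, 4, 7, 6}.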
12248 | |
12249 | /// Lower shuffles where an entire half of a 256-bit vector is UNDEF. |
12250 | /// This allows for fast cases such as subvector extraction/insertion |
12251 | /// or shuffling smaller vector types which can lower more efficiently. |
12252 | static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, |
12253 | SDValue V1, SDValue V2, |
12254 | ArrayRef<int> Mask, |
12255 | const X86Subtarget &Subtarget, |
12256 | SelectionDAG &DAG) { |
12257 | assert(VT.is256BitVector() && "Expected 256-bit vector");
12258 | |
12259 | unsigned NumElts = VT.getVectorNumElements(); |
12260 | unsigned HalfNumElts = NumElts / 2; |
12261 | MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); |
12262 | |
12263 | bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); |
12264 | bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); |
12265 | if (!UndefLower && !UndefUpper) |
12266 | return SDValue(); |
12267 | |
12268 | // Upper half is undef and lower half is whole upper subvector. |
12269 | // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> |
12270 | if (UndefUpper && |
12271 | isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { |
12272 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, |
12273 | DAG.getIntPtrConstant(HalfNumElts, DL)); |
12274 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, |
12275 | DAG.getIntPtrConstant(0, DL)); |
12276 | } |
12277 | |
12278 | // Lower half is undef and upper half is whole lower subvector. |
12279 | // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> |
12280 | if (UndefLower && |
12281 | isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { |
12282 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, |
12283 | DAG.getIntPtrConstant(0, DL)); |
12284 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, |
12285 | DAG.getIntPtrConstant(HalfNumElts, DL)); |
12286 | } |
12287 | |
12288 | // If the shuffle only uses two of the four halves of the input operands, |
12289 | // then extract them and perform the 'half' shuffle at half width. |
12290 | // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> |
12291 | int HalfIdx1 = -1, HalfIdx2 = -1; |
12292 | SmallVector<int, 8> HalfMask(HalfNumElts); |
12293 | unsigned Offset = UndefLower ? HalfNumElts : 0; |
12294 | for (unsigned i = 0; i != HalfNumElts; ++i) { |
12295 | int M = Mask[i + Offset]; |
12296 | if (M < 0) { |
12297 | HalfMask[i] = M; |
12298 | continue; |
12299 | } |
12300 | |
12301 | // Determine which of the 4 half vectors this element is from. |
12302 | // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. |
12303 | int HalfIdx = M / HalfNumElts; |
12304 | |
12305 | // Determine the element index into its half vector source. |
12306 | int HalfElt = M % HalfNumElts; |
12307 | |
12308 | // We can shuffle with up to 2 half vectors, set the new 'half' |
12309 | // shuffle mask accordingly. |
12310 | if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { |
12311 | HalfMask[i] = HalfElt; |
12312 | HalfIdx1 = HalfIdx; |
12313 | continue; |
12314 | } |
12315 | if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { |
12316 | HalfMask[i] = HalfElt + HalfNumElts; |
12317 | HalfIdx2 = HalfIdx; |
12318 | continue; |
12319 | } |
12320 | |
12321 | // Too many half vectors referenced. |
12322 | return SDValue(); |
12323 | } |
12324 | assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12325 | |
12326 | // Only shuffle the halves of the inputs when useful. |
12327 | int NumLowerHalves = |
12328 | (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); |
12329 | int NumUpperHalves = |
12330 | (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); |
12331 | |
12332 | // uuuuXXXX - don't extract uppers just to insert again. |
12333 | if (UndefLower && NumUpperHalves != 0) |
12334 | return SDValue(); |
12335 | |
12336 | // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. |
12337 | if (UndefUpper && NumUpperHalves == 2) |
12338 | return SDValue(); |
12339 | |
12340 | // AVX2 - XXXXuuuu - always extract lowers. |
12341 | if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { |
12342 | // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. |
12343 | if (VT == MVT::v4f64 || VT == MVT::v4i64) |
12344 | return SDValue(); |
12345 | // AVX2 supports variable 32-bit element cross-lane shuffles. |
12346 | if (VT == MVT::v8f32 || VT == MVT::v8i32) { |
12347 | // XXXXuuuu - don't extract lowers and uppers. |
12348 | if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) |
12349 | return SDValue(); |
12350 | } |
12351 | } |
12352 | |
12353 | auto GetHalfVector = [&](int HalfIdx) { |
12354 | if (HalfIdx < 0) |
12355 | return DAG.getUNDEF(HalfVT); |
12356 | SDValue V = (HalfIdx < 2 ? V1 : V2); |
12357 | HalfIdx = (HalfIdx % 2) * HalfNumElts; |
12358 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, |
12359 | DAG.getIntPtrConstant(HalfIdx, DL)); |
12360 | }; |
12361 | |
12362 | SDValue Half1 = GetHalfVector(HalfIdx1); |
12363 | SDValue Half2 = GetHalfVector(HalfIdx2); |
12364 | SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); |
12365 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, |
12366 | DAG.getIntPtrConstant(Offset, DL)); |
12367 | } |
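// Editorial example: the v8i32 mask <u, u, u, u, 2, 3, 0, 1> only reads the
// lower half of V1, so it becomes the v4i32 shuffle {2, 3, 0, 1} of that
// half, inserted into the upper half of an otherwise undef result.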
12368 | |
12369 | /// \brief Test whether the specified input (0 or 1) is in-place blended by the |
12370 | /// given mask. |
12371 | /// |
12372 | /// This returns true if the elements from a particular input are already in
12373 | /// the slots required by the given mask and require no permutation.
12374 | static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { |
12375 | assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12376 | int Size = Mask.size(); |
12377 | for (int i = 0; i < Size; ++i) |
12378 | if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) |
12379 | return false; |
12380 | |
12381 | return true; |
12382 | } |
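// Editorial example: with Size == 4, the blend mask {0, 5, 2, 7} leaves both
// inputs in place, while {1, 5, 2, 7} does not (input 0's element 1 sits in
// slot 0).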
12383 | |
12384 | /// Handle case where shuffle sources are coming from the same 128-bit lane and |
12385 | /// every lane can be represented as the same repeating mask - allowing us to |
12386 | /// shuffle the sources with the repeating shuffle and then permute the result |
12387 | /// to the destination lanes. |
12388 | static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( |
12389 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, |
12390 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
12391 | int NumElts = VT.getVectorNumElements(); |
12392 | int NumLanes = VT.getSizeInBits() / 128; |
12393 | int NumLaneElts = NumElts / NumLanes; |
12394 | |
12395 | // On AVX2 we may be able to just shuffle the lowest elements and then |
12396 | // broadcast the result. |
12397 | if (Subtarget.hasAVX2()) { |
12398 | for (unsigned BroadcastSize : {16, 32, 64}) { |
12399 | if (BroadcastSize <= VT.getScalarSizeInBits()) |
12400 | continue; |
12401 | int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); |
12402 | |
12403 | // Attempt to match a repeating pattern every NumBroadcastElts,
12404 | // accounting for UNDEFs and only accepting references to the lowest
12405 | // 128-bit lane of the inputs.
12406 | auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { |
12407 | for (int i = 0; i != NumElts; i += NumBroadcastElts) |
12408 | for (int j = 0; j != NumBroadcastElts; ++j) { |
12409 | int M = Mask[i + j]; |
12410 | if (M < 0) |
12411 | continue; |
12412 | int &R = RepeatMask[j]; |
12413 | if (0 != ((M % NumElts) / NumLaneElts)) |
12414 | return false; |
12415 | if (0 <= R && R != M) |
12416 | return false; |
12417 | R = M; |
12418 | } |
12419 | return true; |
12420 | }; |
12421 | |
12422 | SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); |
12423 | if (!FindRepeatingBroadcastMask(RepeatMask)) |
12424 | continue; |
12425 | |
12426 | // Shuffle the (lowest) repeated elements in place for broadcast. |
12427 | SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); |
12428 | |
12429 | // Shuffle the actual broadcast. |
12430 | SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); |
12431 | for (int i = 0; i != NumElts; i += NumBroadcastElts) |
12432 | for (int j = 0; j != NumBroadcastElts; ++j) |
12433 | BroadcastMask[i + j] = j; |
12434 | return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), |
12435 | BroadcastMask); |
12436 | } |
12437 | } |
12438 | |
12439 | // Bail if the shuffle mask doesn't cross 128-bit lanes. |
12440 | if (!is128BitLaneCrossingShuffleMask(VT, Mask)) |
12441 | return SDValue(); |
12442 | |
12443 | // Bail if we already have a repeated lane shuffle mask. |
12444 | SmallVector<int, 8> RepeatedShuffleMask; |
12445 | if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) |
12446 | return SDValue(); |
12447 | |
12448 | // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes |
12449 | // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. |
12450 | int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; |
12451 | int NumSubLanes = NumLanes * SubLaneScale; |
12452 | int NumSubLaneElts = NumLaneElts / SubLaneScale; |
12453 | |
12454 | // Check that all the sources are coming from the same lane and see if we can |
12455 | // form a repeating shuffle mask (local to each sub-lane). At the same time, |
12456 | // determine the source sub-lane for each destination sub-lane. |
12457 | int TopSrcSubLane = -1; |
12458 | SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); |
12459 | SmallVector<int, 8> RepeatedSubLaneMasks[2] = { |
12460 | SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef), |
12461 | SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)}; |
12462 | |
12463 | for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { |
12464 | // Extract the sub-lane mask, check that it all comes from the same lane |
12465 | // and normalize the mask entries to come from the first lane. |
12466 | int SrcLane = -1; |
12467 | SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1); |
12468 | for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { |
12469 | int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; |
12470 | if (M < 0) |
12471 | continue; |
12472 | int Lane = (M % NumElts) / NumLaneElts; |
12473 | if ((0 <= SrcLane) && (SrcLane != Lane)) |
12474 | return SDValue(); |
12475 | SrcLane = Lane; |
12476 | int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); |
12477 | SubLaneMask[Elt] = LocalM; |
12478 | } |
12479 | |
12480 | // Whole sub-lane is UNDEF. |
12481 | if (SrcLane < 0) |
12482 | continue; |
12483 | |
12484 | // Attempt to match against the candidate repeated sub-lane masks. |
12485 | for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { |
12486 | auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) { |
12487 | for (int i = 0; i != NumSubLaneElts; ++i) { |
12488 | if (M1[i] < 0 || M2[i] < 0) |
12489 | continue; |
12490 | if (M1[i] != M2[i]) |
12491 | return false; |
12492 | } |
12493 | return true; |
12494 | }; |
12495 | |
12496 | auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; |
12497 | if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) |
12498 | continue; |
12499 | |
12500 | // Merge the sub-lane mask into the matching repeated sub-lane mask. |
12501 | for (int i = 0; i != NumSubLaneElts; ++i) { |
12502 | int M = SubLaneMask[i]; |
12503 | if (M < 0) |
12504 | continue; |
12505 | assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12506 | "Unexpected mask element");
12507 | RepeatedSubLaneMask[i] = M; |
12508 | } |
12509 | |
12510 | // Track the top most source sub-lane - by setting the remaining to UNDEF |
12511 | // we can greatly simplify shuffle matching. |
12512 | int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; |
12513 | TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); |
12514 | Dst2SrcSubLanes[DstSubLane] = SrcSubLane; |
12515 | break; |
12516 | } |
12517 | |
12518 | // Bail if we failed to find a matching repeated sub-lane mask. |
12519 | if (Dst2SrcSubLanes[DstSubLane] < 0) |
12520 | return SDValue(); |
12521 | } |
12522 | assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12523 | "Unexpected source lane");
12524 | |
12525 | // Create a repeating shuffle mask for the entire vector. |
12526 | SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); |
12527 | for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { |
12528 | int Lane = SubLane / SubLaneScale; |
12529 | auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; |
12530 | for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { |
12531 | int M = RepeatedSubLaneMask[Elt]; |
12532 | if (M < 0) |
12533 | continue; |
12534 | int Idx = (SubLane * NumSubLaneElts) + Elt; |
12535 | RepeatedMask[Idx] = M + (Lane * NumLaneElts); |
12536 | } |
12537 | } |
12538 | SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); |
12539 | |
12540 | // Shuffle each source sub-lane to its destination. |
12541 | SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); |
12542 | for (int i = 0; i != NumElts; i += NumSubLaneElts) { |
12543 | int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; |
12544 | if (SrcSubLane < 0) |
12545 | continue; |
12546 | for (int j = 0; j != NumSubLaneElts; ++j) |
12547 | SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); |
12548 | } |
12549 | |
12550 | return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), |
12551 | SubLaneMask); |
12552 | } |
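// Illustrative worked example of the decomposition above: for the v8f32
// mask <5,4,7,6, 1,0,3,2> the in-lane pattern <1,0,3,2> repeats in both
// 128-bit lanes, so the shuffle decomposes into the repeating shuffle
// <1,0,3,2, 5,4,7,6> followed by a sub-lane permute that swaps the two
// 128-bit halves.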
12553 | |
12554 | static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, |
12555 | unsigned &ShuffleImm, |
12556 | ArrayRef<int> Mask) { |
12557 | int NumElts = VT.getVectorNumElements(); |
12558 | assert(VT.getScalarSizeInBits() == 64 &&
12559 | (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12560 | "Unexpected data type for VSHUFPD");
12561 | |
12562 | // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. |
12563 | // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
12564 | ShuffleImm = 0; |
12565 | bool ShufpdMask = true; |
12566 | bool CommutableMask = true; |
12567 | for (int i = 0; i < NumElts; ++i) { |
12568 | if (Mask[i] == SM_SentinelUndef) |
12569 | continue; |
12570 | if (Mask[i] < 0) |
12571 | return false; |
12572 | int Val = (i & 6) + NumElts * (i & 1); |
12573 | int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); |
12574 | if (Mask[i] < Val || Mask[i] > Val + 1) |
12575 | ShufpdMask = false; |
12576 | if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) |
12577 | CommutableMask = false; |
12578 | ShuffleImm |= (Mask[i] % 2) << i; |
12579 | } |
12580 | |
12581 | if (ShufpdMask) |
12582 | return true; |
12583 | if (CommutableMask) { |
12584 | std::swap(V1, V2); |
12585 | return true; |
12586 | } |
12587 | |
12588 | return false; |
12589 | } |
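// A minimal standalone sketch of the immediate computation above (the
// function name is an illustrative assumption; the commuted and zeroable
// cases handled by the real matcher are omitted). Destination slot i may
// only take source element Val or Val + 1, and bit i of the immediate
// records which of the two was taken; e.g. the v4f64 mask <1,5,2,7> yields
// Imm == 0b1011.
static bool sketchMatchShufpdImm(ArrayRef<int> Mask, int NumElts,
                                 unsigned &Imm) {
  Imm = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue; // Undef slot - either source element is acceptable.
    int Val = (i & 6) + NumElts * (i & 1); // Even element of slot i's pair.
    if (Mask[i] != Val && Mask[i] != Val + 1)
      return false;
    Imm |= (Mask[i] % 2) << i; // Record low vs. high element of the pair.
  }
  return true;
}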
12590 | |
12591 | static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, |
12592 | ArrayRef<int> Mask, SDValue V1, |
12593 | SDValue V2, SelectionDAG &DAG) { |
12594 | assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12595 | "Unexpected data type for VSHUFPD");
12596 | |
12597 | unsigned Immediate = 0; |
12598 | if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) |
12599 | return SDValue(); |
12600 | |
12601 | return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, |
12602 | DAG.getConstant(Immediate, DL, MVT::i8)); |
12603 | } |
12604 | |
12605 | static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, |
12606 | ArrayRef<int> Mask, SDValue V1, |
12607 | SDValue V2, SelectionDAG &DAG) { |
12608 | MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); |
12609 | MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); |
12610 | |
12611 | SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); |
12612 | if (V2.isUndef()) |
12613 | return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); |
12614 | |
12615 | return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); |
12616 | } |
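// Note (illustrative): the single-input VPERMV form corresponds to the
// variable-index vperm* instructions (e.g. vpermd/vpermq), while the
// two-input VPERMV3 form corresponds to the vpermt2* family, with the
// shuffle mask materialized as a constant vector.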
12617 | |
12618 | /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. |
12619 | /// |
12620 | /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 |
12621 | /// isn't available. |
12622 | static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
12623 | const APInt &Zeroable, |
12624 | SDValue V1, SDValue V2, |
12625 | const X86Subtarget &Subtarget, |
12626 | SelectionDAG &DAG) { |
12627 | assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12628 | assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12629 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12630 | |
12631 | if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, |
12632 | Zeroable, Subtarget, DAG)) |
12633 | return V; |
12634 | |
12635 | if (V2.isUndef()) { |
12636 | // Check for being able to broadcast a single element. |
12637 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( |
12638 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) |
12639 | return Broadcast; |
12640 | |
12641 | // Use low duplicate instructions for masks that match their pattern. |
12642 | if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) |
12643 | return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); |
12644 | |
12645 | if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { |
12646 | // Non-half-crossing single input shuffles can be lowered with an |
12647 | // interleaved permutation. |
12648 | unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | |
12649 | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); |
12650 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, |
12651 | DAG.getConstant(VPERMILPMask, DL, MVT::i8)); |
12652 | } |
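// Worked example of the immediate above (illustrative): the in-lane mask
// <1,0,3,2> gives VPERMILPMask = 1 | (0 << 1) | (1 << 2) | (0 << 3) =
// 0b0101, i.e. both 128-bit lanes swap their two elements.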
12653 | |
12654 | // With AVX2 we have direct support for this permutation. |
12655 | if (Subtarget.hasAVX2()) |
12656 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, |
12657 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
12658 | |
12659 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
12660 | // results into the target lanes.
12661 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
12662 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) |
12663 | return V; |
12664 | |
12665 | // Otherwise, fall back. |
12666 | return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, |
12667 | DAG); |
12668 | } |
12669 | |
12670 | // Use dedicated unpack instructions for masks that match their pattern. |
12671 | if (SDValue V = |
12672 | lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) |
12673 | return V; |
12674 | |
12675 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, |
12676 | Zeroable, Subtarget, DAG)) |
12677 | return Blend; |
12678 | |
12679 | // Check if the blend happens to exactly fit that of SHUFPD. |
12680 | if (SDValue Op = |
12681 | lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) |
12682 | return Op; |
12683 | |
12684 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
12685 | // results into the target lanes.
12686 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
12687 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) |
12688 | return V; |
12689 | |
12690 | // Try to simplify this by merging 128-bit lanes to enable a lane-based
12691 | // shuffle. However, if we have AVX2 and either input is already in place,
12692 | // we will be able to shuffle the other input, even across lanes, in a
12693 | // single instruction, so skip this pattern.
12694 | if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || |
12695 | isShuffleMaskInputInPlace(1, Mask)))) |
12696 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
12697 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) |
12698 | return Result; |
12699 | // If we have VLX support, we can use VEXPAND. |
12700 | if (Subtarget.hasVLX()) |
12701 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, |
12702 | V1, V2, DAG, Subtarget)) |
12703 | return V; |
12704 | |
12705 | // If we have AVX2 then we always want to lower with a blend because at v4 we
12706 | // can fully permute the elements. |
12707 | if (Subtarget.hasAVX2()) |
12708 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, |
12709 | Mask, DAG); |
12710 | |
12711 | // Otherwise fall back on generic lowering. |
12712 | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); |
12713 | } |
12714 | |
12715 | /// \brief Handle lowering of 4-lane 64-bit integer shuffles. |
12716 | /// |
12717 | /// This routine is only called when we have AVX2 and thus a reasonable |
12718 | /// instruction set for v4i64 shuffling.
12719 | static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
12720 | const APInt &Zeroable, |
12721 | SDValue V1, SDValue V2, |
12722 | const X86Subtarget &Subtarget, |
12723 | SelectionDAG &DAG) { |
12724 | assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12725 | assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12726 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12727 | assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12728 | |
12729 | if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, |
12730 | Zeroable, Subtarget, DAG)) |
12731 | return V; |
12732 | |
12733 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, |
12734 | Zeroable, Subtarget, DAG)) |
12735 | return Blend; |
12736 | |
12737 | // Check for being able to broadcast a single element. |
12738 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, |
12739 | Mask, Subtarget, DAG)) |
12740 | return Broadcast; |
12741 | |
12742 | if (V2.isUndef()) { |
12743 | // When the shuffle is mirrored between the 128-bit lanes of the vector, we
12744 | // can use lower latency instructions that will operate on both lanes. |
12745 | SmallVector<int, 2> RepeatedMask; |
12746 | if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { |
12747 | SmallVector<int, 4> PSHUFDMask; |
12748 | scaleShuffleMask(2, RepeatedMask, PSHUFDMask); |
12749 | return DAG.getBitcast( |
12750 | MVT::v4i64, |
12751 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, |
12752 | DAG.getBitcast(MVT::v8i32, V1), |
12753 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); |
12754 | } |
12755 | |
12756 | // AVX2 provides a direct instruction for permuting a single input across |
12757 | // lanes. |
12758 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, |
12759 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
12760 | } |
12761 | |
12762 | // Try to use shift instructions. |
12763 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, |
12764 | Zeroable, Subtarget, DAG)) |
12765 | return Shift; |
12766 | |
12767 | // If we have VLX support, we can use VALIGN or VEXPAND. |
12768 | if (Subtarget.hasVLX()) { |
12769 | if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, |
12770 | Mask, Subtarget, DAG)) |
12771 | return Rotate; |
12772 | |
12773 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, |
12774 | V1, V2, DAG, Subtarget)) |
12775 | return V; |
12776 | } |
12777 | |
12778 | // Try to use PALIGNR. |
12779 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, |
12780 | Mask, Subtarget, DAG)) |
12781 | return Rotate; |
12782 | |
12783 | // Use dedicated unpack instructions for masks that match their pattern. |
12784 | if (SDValue V = |
12785 | lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) |
12786 | return V; |
12787 | |
12788 | // Try to simplify this by merging 128-bit lanes to enable a lane-based
12789 | // shuffle. However, if we have AVX2 and either input is already in place,
12790 | // we will be able to shuffle the other input, even across lanes, in a
12791 | // single instruction, so skip this pattern.
12792 | if (!isShuffleMaskInputInPlace(0, Mask) && |
12793 | !isShuffleMaskInputInPlace(1, Mask)) |
12794 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
12795 | DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) |
12796 | return Result; |
12797 | |
12798 | // Otherwise fall back on generic blend lowering. |
12799 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, |
12800 | Mask, DAG); |
12801 | } |
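// A minimal standalone sketch of the scaleShuffleMask() step used in the
// v4i64 lowering above (the sketch name is an illustrative assumption):
// each wide index M expands into Scale consecutive narrow indices starting
// at M * Scale, so the per-lane v2i64 mask <1,0> becomes the v4i32 PSHUFD
// pattern <2,3,0,1>.
static void sketchScaleShuffleMask(int Scale, ArrayRef<int> Mask,
                                   SmallVectorImpl<int> &ScaledMask) {
  ScaledMask.clear();
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      ScaledMask.push_back(M < 0 ? M : (M * Scale) + i); // Keep sentinels.
}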
12802 | |
12803 | /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. |
12804 | /// |
12805 | /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 |
12806 | /// isn't available. |
12807 | static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
12808 | const APInt &Zeroable, |
12809 | SDValue V1, SDValue V2, |
12810 | const X86Subtarget &Subtarget, |
12811 | SelectionDAG &DAG) { |
12812 | assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12813 | assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12814 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12815 | |
12816 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, |
12817 | Zeroable, Subtarget, DAG)) |
12818 | return Blend; |
12819 | |
12820 | // Check for being able to broadcast a single element. |
12821 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, |
12822 | Mask, Subtarget, DAG)) |
12823 | return Broadcast; |
12824 | |
12825 | // If the shuffle mask is repeated in each 128-bit lane, we have many more |
12826 | // options to efficiently lower the shuffle. |
12827 | SmallVector<int, 4> RepeatedMask; |
12828 | if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { |
12829 | assert(RepeatedMask.size() == 4 &&
12830 | "Repeated masks must be half the mask width!");
12831 | |
12832 | // Use even/odd duplicate instructions for masks that match their pattern. |
12833 | if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) |
12834 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); |
12835 | if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) |
12836 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); |
12837 | |
12838 | if (V2.isUndef()) |
12839 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, |
12840 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); |
12841 | |
12842 | // Use dedicated unpack instructions for masks that match their pattern. |
12843 | if (SDValue V = |
12844 | lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) |
12845 | return V; |
12846 | |
12847 | // Otherwise, fall back to a SHUFPS sequence. Here it is important that we |
12848 | // have already handled any direct blends. |
12849 | return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); |
12850 | } |
12851 | |
12852 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
12853 | // results into the target lanes.
12854 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
12855 | DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) |
12856 | return V; |
12857 | |
12858 | // If we have a single input shuffle with different shuffle patterns in the |
12859 | // two 128-bit lanes, use a variable VPERMILPS mask.
12860 | if (V2.isUndef()) { |
12861 | SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); |
12862 | if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) |
12863 | return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); |
12864 | |
12865 | if (Subtarget.hasAVX2()) |
12866 | return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); |
12867 | |
12868 | // Otherwise, fall back. |
12869 | return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, |
12870 | DAG); |
12871 | } |
12872 | |
12873 | // Try to simplify this by merging 128-bit lanes to enable a lane-based |
12874 | // shuffle. |
12875 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
12876 | DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) |
12877 | return Result; |
12878 | // If we have VLX support, we can use VEXPAND. |
12879 | if (Subtarget.hasVLX()) |
12880 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, |
12881 | V1, V2, DAG, Subtarget)) |
12882 | return V; |
12883 | |
12884 | // For non-AVX512, if the mask matches a 16-bit-element in-lane unpack
12885 | // pattern, try to split, since after the split we get more efficient code
12886 | // using the vpunpcklwd and vpunpckhwd instructions than with vblend.
12887 | if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) |
12888 | if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, |
12889 | Mask, DAG)) |
12890 | return V; |
12891 | |
12892 | // If we have AVX2 then we always want to lower with a blend because at v8 we |
12893 | // can fully permute the elements. |
12894 | if (Subtarget.hasAVX2()) |
12895 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, |
12896 | Mask, DAG); |
12897 | |
12898 | // Otherwise fall back on generic lowering. |
12899 | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); |
12900 | } |
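// A minimal standalone sketch of the 128-bit lane repetition test used
// above (the name is an illustrative assumption; entries are assumed to be
// undef (< 0) or in range, and the zero sentinel is ignored). A mask is
// lane-repeating when no element crosses lanes and every lane applies the
// same lane-relative pattern; e.g. the v8f32 mask <1,0,3,2, 5,4,7,6>
// repeats as <1,0,3,2>.
static bool sketchIsLaneRepeatedMask(int LaneSize, ArrayRef<int> Mask,
                                     SmallVectorImpl<int> &RepeatedMask) {
  int Size = Mask.size();
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // This entry crosses a 128-bit lane boundary.
    // Rebase second-input indices so they start at LaneSize.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LocalM; // First definition of this slot.
    else if (RepeatedMask[i % LaneSize] != LocalM)
      return false; // Lanes disagree on the pattern.
  }
  return true;
}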
12901 | |
12902 | /// \brief Handle lowering of 8-lane 32-bit integer shuffles. |
12903 | /// |
12904 | /// This routine is only called when we have AVX2 and thus a reasonable |
12905 | /// instruction set for v8i32 shuffling.
12906 | static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
12907 | const APInt &Zeroable, |
12908 | SDValue V1, SDValue V2, |
12909 | const X86Subtarget &Subtarget, |
12910 | SelectionDAG &DAG) { |
12911 | assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12912 | assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12913 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12914 | assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12915 | |
12916 | // Whenever we can lower this as a zext, that instruction is strictly faster |
12917 | // than any alternative. It also allows us to fold memory operands into the |
12918 | // shuffle in many cases. |
12919 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
12920 | DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
12921 | return ZExt; |
12922 | |
12923 | // For non-AVX512, if the mask matches a 16-bit-element in-lane unpack
12924 | // pattern, try to split, since after the split we get more efficient code
12925 | // than vblend by using the vpunpcklwd and vpunpckhwd instructions.
12926 | if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && |
12927 | !Subtarget.hasAVX512()) |
12928 | if (SDValue V = |
12929 | lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) |
12930 | return V; |
12931 | |
12932 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, |
12933 | Zeroable, Subtarget, DAG)) |
12934 | return Blend; |
12935 | |
12936 | // Check for being able to broadcast a single element. |
12937 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, |
12938 | Mask, Subtarget, DAG)) |
12939 | return Broadcast; |
12940 | |
12941 | // If the shuffle mask is repeated in each 128-bit lane we can use more |
12942 | // efficient instructions that mirror the shuffles across the two 128-bit |
12943 | // lanes. |
12944 | SmallVector<int, 4> RepeatedMask; |
12945 | bool Is128BitLaneRepeatedShuffle = |
12946 | is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); |
12947 | if (Is128BitLaneRepeatedShuffle) { |
12948 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12949 | if (V2.isUndef()) |
12950 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, |
12951 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); |
12952 | |
12953 | // Use dedicated unpack instructions for masks that match their pattern. |
12954 | if (SDValue V = |
12955 | lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) |
12956 | return V; |
12957 | } |
12958 | |
12959 | // Try to use shift instructions. |
12960 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, |
12961 | Zeroable, Subtarget, DAG)) |
12962 | return Shift; |
12963 | |
12964 | // If we have VLX support, we can use VALIGN or VEXPAND.
12965 | if (Subtarget.hasVLX()) { |
12966 | if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, |
12967 | Mask, Subtarget, DAG)) |
12968 | return Rotate; |
12969 | |
12970 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, |
12971 | V1, V2, DAG, Subtarget)) |
12972 | return V; |
12973 | } |
12974 | |
12975 | // Try to use byte rotation instructions. |
12976 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
12977 | DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) |
12978 | return Rotate; |
12979 | |
12980 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
12981 | // results into the target lanes. |
12982 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
12983 | DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) |
12984 | return V; |
12985 | |
12986 | // If the shuffle patterns aren't repeated but it is a single input, directly |
12987 | // generate a cross-lane VPERMD instruction. |
12988 | if (V2.isUndef()) { |
12989 | SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); |
12990 | return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); |
12991 | } |
12992 | |
12993 | // Assume that a single SHUFPS is faster than an alternative sequence of |
12994 | // multiple instructions (even if the CPU has a domain penalty). |
12995 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. |
12996 | if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { |
12997 | SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); |
12998 | SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); |
12999 | SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, |
13000 | CastV1, CastV2, DAG); |
13001 | return DAG.getBitcast(MVT::v8i32, ShufPS); |
13002 | } |
13003 | |
13004 | // Try to simplify this by merging 128-bit lanes to enable a lane-based |
13005 | // shuffle. |
13006 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
13007 | DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) |
13008 | return Result; |
13009 | |
13010 | // Otherwise fall back on generic blend lowering. |
13011 | return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, |
13012 | Mask, DAG); |
13013 | } |
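// Illustrative note on the SHUFPS fallback above: a repeated mask such as
// <0,1,4,5> takes its low pair from V1 and its high pair from V2, so it
// fits a single SHUFPS; the v8i32 operands are bitcast to v8f32, shuffled,
// and cast back, on the assumption that one cross-domain instruction beats
// a longer integer-domain sequence.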
13014 | |
13015 | /// \brief Handle lowering of 16-lane 16-bit integer shuffles. |
13016 | /// |
13017 | /// This routine is only called when we have AVX2 and thus a reasonable |
13018 | /// instruction set for v16i16 shuffling.
13019 | static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13020 | const APInt &Zeroable, |
13021 | SDValue V1, SDValue V2, |
13022 | const X86Subtarget &Subtarget, |
13023 | SelectionDAG &DAG) { |
13024 | assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13025 | assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13026 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13027 | assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13028 | |
13029 | // Whenever we can lower this as a zext, that instruction is strictly faster |
13030 | // than any alternative. It also allows us to fold memory operands into the |
13031 | // shuffle in many cases. |
13032 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
13033 | DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13034 | return ZExt; |
13035 | |
13036 | // Check for being able to broadcast a single element. |
13037 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, |
13038 | Mask, Subtarget, DAG)) |
13039 | return Broadcast; |
13040 | |
13041 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, |
13042 | Zeroable, Subtarget, DAG)) |
13043 | return Blend; |
13044 | |
13045 | // Use dedicated unpack instructions for masks that match their pattern. |
13046 | if (SDValue V = |
13047 | lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) |
13048 | return V; |
13049 | |
13050 | // Try to use shift instructions. |
13051 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, |
13052 | Zeroable, Subtarget, DAG)) |
13053 | return Shift; |
13054 | |
13055 | // Try to use byte rotation instructions. |
13056 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
13057 | DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) |
13058 | return Rotate; |
13059 | |
13060 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
13061 | // results into the target lanes.
13062 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
13063 | DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) |
13064 | return V; |
13065 | |
13066 | if (V2.isUndef()) { |
13067 | // There are no generalized cross-lane shuffle operations available on i16 |
13068 | // element types. |
13069 | if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) |
13070 | return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, |
13071 | Mask, DAG); |
13072 | |
13073 | SmallVector<int, 8> RepeatedMask; |
13074 | if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { |
13075 | // As this is a single-input shuffle, the repeated mask should be |
13076 | // a strictly valid v8i16 mask that we can pass through to the v8i16 |
13077 | // lowering to handle even the v16 case. |
13078 | return lowerV8I16GeneralSingleInputVectorShuffle( |
13079 | DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); |
13080 | } |
13081 | } |
13082 | |
13083 | if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( |
13084 | DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) |
13085 | return PSHUFB; |
13086 | |
13087 | // AVX512BWVL can lower to VPERMW. |
13088 | if (Subtarget.hasBWI() && Subtarget.hasVLX()) |
13089 | return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); |
13090 | |
13091 | // Try to simplify this by merging 128-bit lanes to enable a lane-based |
13092 | // shuffle. |
13093 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
13094 | DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) |
13095 | return Result; |
13096 | |
13097 | // Otherwise fall back on generic lowering. |
13098 | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); |
13099 | } |
13100 | |
13101 | /// \brief Handle lowering of 32-lane 8-bit integer shuffles. |
13102 | /// |
13103 | /// This routine is only called when we have AVX2 and thus a reasonable |
13104 | /// instruction set for v32i8 shuffling.
13105 | static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13106 | const APInt &Zeroable, |
13107 | SDValue V1, SDValue V2, |
13108 | const X86Subtarget &Subtarget, |
13109 | SelectionDAG &DAG) { |
13110 | assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13111 | assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13112 | assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13113 | assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13114 | |
13115 | // Whenever we can lower this as a zext, that instruction is strictly faster |
13116 | // than any alternative. It also allows us to fold memory operands into the |
13117 | // shuffle in many cases. |
13118 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
13119 | DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13120 | return ZExt; |
13121 | |
13122 | // Check for being able to broadcast a single element. |
13123 | if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, |
13124 | Mask, Subtarget, DAG)) |
13125 | return Broadcast; |
13126 | |
13127 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, |
13128 | Zeroable, Subtarget, DAG)) |
13129 | return Blend; |
13130 | |
13131 | // Use dedicated unpack instructions for masks that match their pattern. |
13132 | if (SDValue V = |
13133 | lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) |
13134 | return V; |
13135 | |
13136 | // Try to use shift instructions. |
13137 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, |
13138 | Zeroable, Subtarget, DAG)) |
13139 | return Shift; |
13140 | |
13141 | // Try to use byte rotation instructions. |
13142 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
13143 | DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) |
13144 | return Rotate; |
13145 | |
13146 | // Try to create an in-lane repeating shuffle mask and then shuffle the |
13147 | // results into the target lanes.
13148 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
13149 | DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) |
13150 | return V; |
13151 | |
13152 | // There are no generalized cross-lane shuffle operations available on i8 |
13153 | // element types. |
13154 | if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) |
13155 | return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, |
13156 | DAG); |
13157 | |
13158 | if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( |
13159 | DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) |
13160 | return PSHUFB; |
13161 | |
13162 | // Try to simplify this by merging 128-bit lanes to enable a lane-based |
13163 | // shuffle. |
13164 | if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( |
13165 | DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) |
13166 | return Result; |
13167 | |
13168 | // Otherwise fall back on generic lowering. |
13169 | return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); |
13170 | } |
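// Illustrative note: VPSHUFB on a 256-bit vector permutes bytes within each
// 128-bit lane independently, so e.g. reversing the bytes of both halves of
// a v32i8 vector is a single PSHUFB with a constant mask, while any byte
// that crosses a lane must go through the lane-permute paths above.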
13171 | |
13172 | /// \brief High-level routine to lower various 256-bit x86 vector shuffles. |
13173 | /// |
13174 | /// This routine either breaks down the specific type of a 256-bit x86 vector |
13175 | /// shuffle or splits it into two 128-bit shuffles and fuses the results back |
13176 | /// together based on the available instructions. |
13177 | static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13178 | MVT VT, SDValue V1, SDValue V2, |
13179 | const APInt &Zeroable, |
13180 | const X86Subtarget &Subtarget, |
13181 | SelectionDAG &DAG) { |
13182 | // If we have a single input to the zero element, insert that into V1 if we |
13183 | // can do so cheaply. |
13184 | int NumElts = VT.getVectorNumElements(); |
13185 | int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); |
13186 | |
13187 | if (NumV2Elements == 1 && Mask[0] >= NumElts) |
13188 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
13189 | DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13190 | return Insertion; |
13191 | |
13192 | // Handle special cases where the lower or upper half is UNDEF. |
13193 | if (SDValue V = |
13194 | lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) |
13195 | return V; |
13196 | |
13197 | // There is a really nice hard cut-over between AVX1 and AVX2 that means we |
13198 | // can check for those subtargets here and avoid much of the subtarget |
13199 | // querying in the per-vector-type lowering routines. With AVX1 we have |
13200 | // essentially *zero* ability to manipulate a 256-bit vector with integer |
13201 | // types. Since we'll use floating point types there eventually, just |
13202 | // immediately cast everything to a float and operate entirely in that domain. |
13203 | if (VT.isInteger() && !Subtarget.hasAVX2()) { |
13204 | int ElementBits = VT.getScalarSizeInBits(); |
13205 | if (ElementBits < 32) { |
13206 | // No floating point type available; if we can't use the bit operations
13207 | // for masking/blending then decompose into 128-bit vectors. |
13208 | if (SDValue V = |
13209 | lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) |
13210 | return V; |
13211 | if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) |
13212 | return V; |
13213 | return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); |
13214 | } |
13215 | |
13216 | MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), |
13217 | VT.getVectorNumElements()); |
13218 | V1 = DAG.getBitcast(FpVT, V1); |
13219 | V2 = DAG.getBitcast(FpVT, V2); |
13220 | return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); |
13221 | } |
13222 | |
13223 | switch (VT.SimpleTy) { |
13224 | case MVT::v4f64: |
13225 | return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13226 | case MVT::v4i64: |
13227 | return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13228 | case MVT::v8f32: |
13229 | return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13230 | case MVT::v8i32: |
13231 | return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13232 | case MVT::v16i16: |
13233 | return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13234 | case MVT::v32i8: |
13235 | return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13236 | |
13237 | default: |
13238 | llvm_unreachable("Not a valid 256-bit x86 vector type!");
13239 | } |
13240 | } |
13241 | |
13242 | /// \brief Try to lower a vector shuffle as a sequence of 128-bit shuffles.
13243 | static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, |
13244 | ArrayRef<int> Mask, SDValue V1, |
13245 | SDValue V2, SelectionDAG &DAG) { |
13246 | assert(VT.getScalarSizeInBits() == 64 &&
13247 | "Unexpected element type size for 128bit shuffle.");
13248 | |
13249 | // Handling a 256-bit vector requires VLX, and most probably the function
13250 | // lowerV2X128VectorShuffle() is the better solution there.
13251 | assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13252 | |
13253 | SmallVector<int, 4> WidenedMask; |
13254 | if (!canWidenShuffleElements(Mask, WidenedMask)) |
13255 | return SDValue(); |
13256 | |
13257 | // Check for patterns which can be matched with a single insert of a 256-bit |
13258 | // subvector. |
13259 | bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, |
13260 | {0, 1, 2, 3, 0, 1, 2, 3}); |
13261 | if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, |
13262 | {0, 1, 2, 3, 8, 9, 10, 11})) { |
13263 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); |
13264 | SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, |
13265 | DAG.getIntPtrConstant(0, DL)); |
13266 | SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, |
13267 | OnlyUsesV1 ? V1 : V2, |
13268 | DAG.getIntPtrConstant(0, DL)); |
13269 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); |
13270 | } |
13271 | |
13272 | assert(WidenedMask.size() == 4);
13273 | |
13274 | // See if this is an insertion of the lower 128 bits of V2 into V1.
13275 | bool IsInsert = true; |
13276 | int V2Index = -1; |
13277 | for (int i = 0; i < 4; ++i) { |
13278 | assert(WidenedMask[i] >= -1);
13279 | if (WidenedMask[i] < 0) |
13280 | continue; |
13281 | |
13282 | // Make sure all V1 subvectors are in place. |
13283 | if (WidenedMask[i] < 4) { |
13284 | if (WidenedMask[i] != i) { |
13285 | IsInsert = false; |
13286 | break; |
13287 | } |
13288 | } else { |
13289 | // Make sure we only have a single V2 index and it's the lowest 128 bits.
13290 | if (V2Index >= 0 || WidenedMask[i] != 4) { |
13291 | IsInsert = false; |
13292 | break; |
13293 | } |
13294 | V2Index = i; |
13295 | } |
13296 | } |
13297 | if (IsInsert && V2Index >= 0) { |
13298 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); |
13299 | SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, |
13300 | DAG.getIntPtrConstant(0, DL)); |
13301 | return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); |
13302 | } |
13303 | |
13304 | // Try to lower to vshuf64x2/vshuf32x4.
13305 | SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; |
13306 | unsigned PermMask = 0; |
13307 | // Ensure elements came from the same Op.
13308 | for (int i = 0; i < 4; ++i) { |
13309 | assert(WidenedMask[i] >= -1);
13310 | if (WidenedMask[i] < 0) |
13311 | continue; |
13312 | |
13313 | SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; |
13314 | unsigned OpIndex = i / 2; |
13315 | if (Ops[OpIndex].isUndef()) |
13316 | Ops[OpIndex] = Op; |
13317 | else if (Ops[OpIndex] != Op) |
13318 | return SDValue(); |
13319 | |
13320 | // Convert the 128-bit shuffle mask selection values into 128-bit selection |
13321 | // bits defined by a vshuf64x2 instruction's immediate control byte. |
13322 | PermMask |= (WidenedMask[i] % 4) << (i * 2); |
13323 | } |
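// E.g. (illustrative) the widened mask <0,1,4,5> places 128-bit quarters 0
// and 1 of V1 in the low half and quarters 0 and 1 of V2 in the high half:
// PermMask = 0 | (1 << 2) | (0 << 4) | (1 << 6) = 0x44.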
13324 | |
13325 | return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], |
13326 | DAG.getConstant(PermMask, DL, MVT::i8)); |
13327 | } |
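// A minimal standalone sketch of the canWidenShuffleElements() step used
// above (the sketch name is an illustrative assumption; an even-sized mask
// is assumed and the zero sentinel the real helper also handles is
// ignored). Each adjacent pair <2k, 2k+1> collapses into the single wide
// index k, so the v8i64-style mask <0,1,4,5,8,9,12,13> widens to <0,2,4,6>.
static bool sketchWidenShuffleMask(ArrayRef<int> Mask,
                                   SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.clear();
  for (int i = 0, Size = Mask.size(); i != Size; i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 < 0 && M1 < 0)
      WidenedMask.push_back(-1); // Both halves undef.
    else if (M0 >= 0 && (M0 % 2) == 0 && (M1 < 0 || M1 == M0 + 1))
      WidenedMask.push_back(M0 / 2); // Aligned pair (high half may be undef).
    else if (M0 < 0 && M1 >= 0 && (M1 % 2) == 1)
      WidenedMask.push_back(M1 / 2); // Low half undef, high half aligned.
    else
      return false; // The pair cannot form one wide element.
  }
  return true;
}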
13328 | |
13329 | /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. |
13330 | static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13331 | const APInt &Zeroable, |
13332 | SDValue V1, SDValue V2, |
13333 | const X86Subtarget &Subtarget, |
13334 | SelectionDAG &DAG) { |
13335 | assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13336 | assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13337 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13338 | |
13339 | if (V2.isUndef()) { |
13340 | // Use low duplicate instructions for masks that match their pattern. |
13341 | if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) |
13342 | return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); |
13343 | |
13344 | if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { |
13345 | // Non-half-crossing single input shuffles can be lowered with an |
13346 | // interleaved permutation. |
13347 | unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | |
13348 | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | |
13349 | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | |
13350 | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); |
13351 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, |
13352 | DAG.getConstant(VPERMILPMask, DL, MVT::i8)); |
13353 | } |
13354 | |
13355 | SmallVector<int, 4> RepeatedMask; |
13356 | if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) |
13357 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, |
13358 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); |
13359 | } |
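// E.g. (illustrative) the v8f64 mask <2,3,0,1, 6,7,4,5> repeats the pattern
// <2,3,0,1> across both 256-bit halves, so it lowers to a single VPERMI
// with immediate 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E.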
13360 | |
13361 | if (SDValue Shuf128 = |
13362 | lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) |
13363 | return Shuf128; |
13364 | |
13365 | if (SDValue Unpck = |
13366 | lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) |
13367 | return Unpck; |
13368 | |
13369 | // Check if the blend happens to exactly fit that of SHUFPD. |
13370 | if (SDValue Op = |
13371 | lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) |
13372 | return Op; |
13373 | |
13374 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, |
13375 | V2, DAG, Subtarget)) |
13376 | return V; |
13377 | |
13378 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, |
13379 | Zeroable, Subtarget, DAG)) |
13380 | return Blend; |
13381 | |
13382 | return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); |
13383 | } |
13384 | |
13385 | /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. |
13386 | static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13387 | const APInt &Zeroable, |
13388 | SDValue V1, SDValue V2, |
13389 | const X86Subtarget &Subtarget, |
13390 | SelectionDAG &DAG) { |
13391 | assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13392 | assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13393 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13394 | |
13395 | // If the shuffle mask is repeated in each 128-bit lane, we have many more |
13396 | // options to efficiently lower the shuffle. |
13397 | SmallVector<int, 4> RepeatedMask; |
13398 | if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { |
13399 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13400 | |
13401 | // Use even/odd duplicate instructions for masks that match their pattern. |
13402 | if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) |
13403 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); |
13404 | if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) |
13405 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); |
13406 | |
13407 | if (V2.isUndef()) |
13408 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, |
13409 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); |
13410 | |
13411 | // Use dedicated unpack instructions for masks that match their pattern. |
13412 | if (SDValue Unpck = |
13413 | lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) |
13414 | return Unpck; |
13415 | |
13416 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, |
13417 | Zeroable, Subtarget, DAG)) |
13418 | return Blend; |
13419 | |
13420 | // Otherwise, fall back to a SHUFPS sequence. |
13421 | return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); |
13422 | } |
13423 | // If we have AVX512F support, we can use VEXPAND. |
13424 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, |
13425 | V1, V2, DAG, Subtarget)) |
13426 | return V; |
13427 | |
13428 | return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); |
13429 | } |
13430 | |
13431 | /// \brief Handle lowering of 8-lane 64-bit integer shuffles. |
13432 | static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13433 | const APInt &Zeroable, |
13434 | SDValue V1, SDValue V2, |
13435 | const X86Subtarget &Subtarget, |
13436 | SelectionDAG &DAG) { |
13437 | assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13438 | assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13439 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13440 | |
13441 | if (SDValue Shuf128 = |
13442 | lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) |
13443 | return Shuf128; |
13444 | |
13445 | if (V2.isUndef()) { |
13446 | // When the shuffle is mirrored between the 128-bit lanes of the vector, we
13447 | // can use lower latency instructions that will operate on all four |
13448 | // 128-bit lanes. |
13449 | SmallVector<int, 2> Repeated128Mask; |
13450 | if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { |
13451 | SmallVector<int, 4> PSHUFDMask; |
13452 | scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); |
13453 | return DAG.getBitcast( |
13454 | MVT::v8i64, |
13455 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, |
13456 | DAG.getBitcast(MVT::v16i32, V1), |
13457 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); |
13458 | } |
13459 | |
13460 | SmallVector<int, 4> Repeated256Mask; |
13461 | if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) |
13462 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, |
13463 | getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); |
13464 | } |
13465 | |
13466 | // Try to use shift instructions. |
13467 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, |
13468 | Zeroable, Subtarget, DAG)) |
13469 | return Shift; |
13470 | |
13471 | // Try to use VALIGN. |
13472 | if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, |
13473 | Mask, Subtarget, DAG)) |
13474 | return Rotate; |
13475 | |
13476 | // Try to use PALIGNR. |
13477 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, |
13478 | Mask, Subtarget, DAG)) |
13479 | return Rotate; |
13480 | |
13481 | if (SDValue Unpck = |
13482 | lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) |
13483 | return Unpck; |
13484 | // If we have AVX512F support, we can use VEXPAND. |
13485 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, |
13486 | V2, DAG, Subtarget)) |
13487 | return V; |
13488 | |
13489 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, |
13490 | Zeroable, Subtarget, DAG)) |
13491 | return Blend; |
13492 | |
13493 | return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); |
13494 | } |
13495 | |
13496 | /// \brief Handle lowering of 16-lane 32-bit integer shuffles. |
13497 | static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13498 | const APInt &Zeroable, |
13499 | SDValue V1, SDValue V2, |
13500 | const X86Subtarget &Subtarget, |
13501 | SelectionDAG &DAG) { |
13502 | assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13503 | assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13504 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13505 | |
13506 | // Whenever we can lower this as a zext, that instruction is strictly faster |
13507 | // than any alternative. It also allows us to fold memory operands into the |
13508 | // shuffle in many cases. |
13509 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
13510 | DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13511 | return ZExt; |
13512 | |
13513 | // If the shuffle mask is repeated in each 128-bit lane we can use more |
13514 | // efficient instructions that mirror the shuffles across the four 128-bit |
13515 | // lanes. |
13516 | SmallVector<int, 4> RepeatedMask; |
13517 | bool Is128BitLaneRepeatedShuffle = |
13518 | is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); |
13519 | if (Is128BitLaneRepeatedShuffle) { |
13520 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13521 | if (V2.isUndef()) |
13522 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, |
13523 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); |
13524 | |
13525 | // Use dedicated unpack instructions for masks that match their pattern. |
13526 | if (SDValue V = |
13527 | lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) |
13528 | return V; |
13529 | } |
13530 | |
13531 | // Try to use shift instructions. |
13532 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, |
13533 | Zeroable, Subtarget, DAG)) |
13534 | return Shift; |
13535 | |
13536 | // Try to use VALIGN. |
13537 | if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, |
13538 | Mask, Subtarget, DAG)) |
13539 | return Rotate; |
13540 | |
13541 | // Try to use byte rotation instructions. |
13542 | if (Subtarget.hasBWI()) |
13543 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
13544 | DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) |
13545 | return Rotate; |
13546 | |
13547 | // Assume that a single SHUFPS is faster than using a permv shuffle. |
13548 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. |
13549 | if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { |
13550 | SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); |
13551 | SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); |
13552 | SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, |
13553 | CastV1, CastV2, DAG); |
13554 | return DAG.getBitcast(MVT::v16i32, ShufPS); |
13555 | } |
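      | // For example, a repeated per-lane mask of <0, 1, 6, 7> takes elements
      | // 0-1 of each lane from CastV1 and elements 2-3 from CastV2, so one
      | // SHUFPS covers all four 128-bit lanes before the bitcast back to v16i32.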
13556 | // If we have AVX512F support, we can use VEXPAND. |
13557 | if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, |
13558 | V1, V2, DAG, Subtarget)) |
13559 | return V; |
13560 | |
13561 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, |
13562 | Zeroable, Subtarget, DAG)) |
13563 | return Blend; |
13564 | return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); |
13565 | } |
13566 | |
13567 | /// \brief Handle lowering of 32-lane 16-bit integer shuffles. |
13568 | static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13569 | const APInt &Zeroable, |
13570 | SDValue V1, SDValue V2, |
13571 | const X86Subtarget &Subtarget, |
13572 | SelectionDAG &DAG) { |
13573 | assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13574 | assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13575 | assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13576 | assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13577 | |
13578 | // Whenever we can lower this as a zext, that instruction is strictly faster |
13579 | // than any alternative. It also allows us to fold memory operands into the |
13580 | // shuffle in many cases. |
13581 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
13582 | DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13583 | return ZExt; |
13584 | |
13585 | // Use dedicated unpack instructions for masks that match their pattern. |
13586 | if (SDValue V = |
13587 | lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) |
13588 | return V; |
13589 | |
13590 | // Try to use shift instructions. |
13591 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, |
13592 | Zeroable, Subtarget, DAG)) |
13593 | return Shift; |
13594 | |
13595 | // Try to use byte rotation instructions. |
13596 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
13597 | DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) |
13598 | return Rotate; |
13599 | |
13600 | if (V2.isUndef()) { |
13601 | SmallVector<int, 8> RepeatedMask; |
13602 | if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { |
13603 | // As this is a single-input shuffle, the repeated mask should be |
13604 | // a strictly valid v8i16 mask that we can pass through to the v8i16 |
13605 | // lowering to handle even the v32 case. |
13606 | return lowerV8I16GeneralSingleInputVectorShuffle( |
13607 | DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); |
13608 | } |
13609 | } |
13610 | |
13611 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, |
13612 | Zeroable, Subtarget, DAG)) |
13613 | return Blend; |
13614 | |
13615 | return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); |
13616 | } |
13617 | |
13618 | /// \brief Handle lowering of 64-lane 8-bit integer shuffles. |
13619 | static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13620 | const APInt &Zeroable, |
13621 | SDValue V1, SDValue V2, |
13622 | const X86Subtarget &Subtarget, |
13623 | SelectionDAG &DAG) { |
13624 | assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13625 | assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13626 | assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13627 | assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13628 | |
13629 | // Whenever we can lower this as a zext, that instruction is strictly faster |
13630 | // than any alternative. It also allows us to fold memory operands into the |
13631 | // shuffle in many cases. |
13632 | if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( |
13633 | DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13634 | return ZExt; |
13635 | |
13636 | // Use dedicated unpack instructions for masks that match their pattern. |
13637 | if (SDValue V = |
13638 | lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) |
13639 | return V; |
13640 | |
13641 | // Try to use shift instructions. |
13642 | if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, |
13643 | Zeroable, Subtarget, DAG)) |
13644 | return Shift; |
13645 | |
13646 | // Try to use byte rotation instructions. |
13647 | if (SDValue Rotate = lowerVectorShuffleAsByteRotate( |
13648 | DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) |
13649 | return Rotate; |
13650 | |
13651 | if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( |
13652 | DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) |
13653 | return PSHUFB; |
13654 | |
13655 | // VBMI can use VPERMV/VPERMV3 byte shuffles. |
13656 | if (Subtarget.hasVBMI()) |
13657 | return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); |
13658 | |
13659 | // Try to create an in-lane repeating shuffle mask and then shuffle the
13660 | // results into the target lanes.
13661 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( |
13662 | DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) |
13663 | return V; |
13664 | |
13665 | if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, |
13666 | Zeroable, Subtarget, DAG)) |
13667 | return Blend; |
13668 | |
13669 | // FIXME: Implement direct support for this type! |
13670 | return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); |
13671 | } |
13672 | |
13673 | /// \brief High-level routine to lower various 512-bit x86 vector shuffles. |
13674 | /// |
13675 | /// This routine either breaks down the specific type of a 512-bit x86 vector |
13676 | /// shuffle or splits it into two 256-bit shuffles and fuses the results back |
13677 | /// together based on the available instructions. |
13678 | static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13679 | MVT VT, SDValue V1, SDValue V2, |
13680 | const APInt &Zeroable, |
13681 | const X86Subtarget &Subtarget, |
13682 | SelectionDAG &DAG) { |
13683 | assert(Subtarget.hasAVX512() &&
13684 | "Cannot lower 512-bit vectors w/ basic ISA!");
13685 | |
13686 | // If we have a single input to the zero element, insert that into V1 if we |
13687 | // can do so cheaply. |
13688 | int NumElts = Mask.size(); |
13689 | int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); |
13690 | |
13691 | if (NumV2Elements == 1 && Mask[0] >= NumElts) |
13692 | if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( |
13693 | DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) |
13694 | return Insertion; |
13695 | |
13696 | // Check for being able to broadcast a single element. |
13697 | if (SDValue Broadcast = |
13698 | lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) |
13699 | return Broadcast; |
13700 | |
13701 | // Dispatch to each element type for lowering. If we don't have support for |
13702 | // specific element type shuffles at 512 bits, immediately split them and |
13703 | // lower them. Each lowering routine of a given type is allowed to assume that |
13704 | // the requisite ISA extensions for that element type are available. |
13705 | switch (VT.SimpleTy) { |
13706 | case MVT::v8f64: |
13707 | return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13708 | case MVT::v16f32: |
13709 | return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13710 | case MVT::v8i64: |
13711 | return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13712 | case MVT::v16i32: |
13713 | return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13714 | case MVT::v32i16: |
13715 | return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13716 | case MVT::v64i8: |
13717 | return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); |
13718 | |
13719 | default: |
13720 | llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 13720); |
13721 | } |
13722 | } |
13723 | |
13724 | // Lower vXi1 vector shuffles. |
13725 | // There is no dedicated instruction on AVX-512 that shuffles the masks.
13726 | // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13727 | // vector, shuffle, and then truncate it back.
13728 | static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, |
13729 | MVT VT, SDValue V1, SDValue V2, |
13730 | const X86Subtarget &Subtarget, |
13731 | SelectionDAG &DAG) { |
13732 | assert(Subtarget.hasAVX512() &&
13733 | "Cannot lower 512-bit vectors w/o basic ISA!");
13734 | MVT ExtVT; |
13735 | switch (VT.SimpleTy) { |
13736 | default: |
13737 | llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 13737); |
13738 | case MVT::v2i1: |
13739 | ExtVT = MVT::v2i64; |
13740 | break; |
13741 | case MVT::v4i1: |
13742 | ExtVT = MVT::v4i32; |
13743 | break; |
13744 | case MVT::v8i1: |
13745 | ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL |
13746 | break; |
13747 | case MVT::v16i1: |
13748 | ExtVT = MVT::v16i32; |
13749 | break; |
13750 | case MVT::v32i1: |
13751 | ExtVT = MVT::v32i16; |
13752 | break; |
13753 | case MVT::v64i1: |
13754 | ExtVT = MVT::v64i8; |
13755 | break; |
13756 | } |
13757 | |
13758 | if (ISD::isBuildVectorAllZeros(V1.getNode())) |
13759 | V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); |
13760 | else if (ISD::isBuildVectorAllOnes(V1.getNode())) |
13761 | V1 = getOnesVector(ExtVT, DAG, DL); |
13762 | else |
13763 | V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); |
13764 | |
13765 | if (V2.isUndef()) |
13766 | V2 = DAG.getUNDEF(ExtVT); |
13767 | else if (ISD::isBuildVectorAllZeros(V2.getNode())) |
13768 | V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); |
13769 | else if (ISD::isBuildVectorAllOnes(V2.getNode())) |
13770 | V2 = getOnesVector(ExtVT, DAG, DL); |
13771 | else |
13772 | V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); |
13773 | |
13774 | SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); |
13775 | // Since i1 was sign-extended we can use X86ISD::CVT2MASK.
13776 | int NumElems = VT.getVectorNumElements(); |
13777 | if ((Subtarget.hasBWI() && (NumElems >= 32)) || |
13778 | (Subtarget.hasDQI() && (NumElems < 32))) |
13779 | return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle); |
13780 | |
13781 | return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); |
13782 | } |
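      | // End-to-end, lower1BitVectorShuffle thus sign-extends e.g. a v32i1
      | // shuffle to v32i16, shuffles there, and converts back with
      | // X86ISD::CVT2MASK when BWI (NumElems >= 32) or DQI (NumElems < 32) is
      | // available, falling back to a plain ISD::TRUNCATE otherwise.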
13783 | |
13784 | /// Helper function that returns true if the shuffle mask should be |
13785 | /// commuted to improve canonicalization. |
13786 | static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { |
13787 | int NumElements = Mask.size(); |
13788 | |
13789 | int NumV1Elements = 0, NumV2Elements = 0; |
13790 | for (int M : Mask) |
13791 | if (M < 0) |
13792 | continue; |
13793 | else if (M < NumElements) |
13794 | ++NumV1Elements; |
13795 | else |
13796 | ++NumV2Elements; |
13797 | |
13798 | // Commute the shuffle as needed such that more elements come from V1 than |
13799 | // V2. This allows us to match the shuffle pattern strictly on how many |
13800 | // elements come from V1 without handling the symmetric cases. |
13801 | if (NumV2Elements > NumV1Elements) |
13802 | return true; |
13803 | |
13804 | assert(NumV1Elements > 0 && "No V1 indices");
13805 | |
13806 | if (NumV2Elements == 0) |
13807 | return false; |
13808 | |
13809 | // When the numbers of V1 and V2 elements are the same, try to minimize the
13810 | // number of uses of V2 in the low half of the vector. When that is tied,
13811 | // ensure that the sum of indices for V1 is equal to or lower than the sum
13812 | // of indices for V2. When those are equal, try to ensure that the number of
13813 | // odd indices for V1 is lower than the number of odd indices for V2.
13814 | if (NumV1Elements == NumV2Elements) { |
13815 | int LowV1Elements = 0, LowV2Elements = 0; |
13816 | for (int M : Mask.slice(0, NumElements / 2)) |
13817 | if (M >= NumElements) |
13818 | ++LowV2Elements; |
13819 | else if (M >= 0) |
13820 | ++LowV1Elements; |
13821 | if (LowV2Elements > LowV1Elements) |
13822 | return true; |
13823 | if (LowV2Elements == LowV1Elements) { |
13824 | int SumV1Indices = 0, SumV2Indices = 0; |
13825 | for (int i = 0, Size = Mask.size(); i < Size; ++i) |
13826 | if (Mask[i] >= NumElements) |
13827 | SumV2Indices += i; |
13828 | else if (Mask[i] >= 0) |
13829 | SumV1Indices += i; |
13830 | if (SumV2Indices < SumV1Indices) |
13831 | return true; |
13832 | if (SumV2Indices == SumV1Indices) { |
13833 | int NumV1OddIndices = 0, NumV2OddIndices = 0; |
13834 | for (int i = 0, Size = Mask.size(); i < Size; ++i) |
13835 | if (Mask[i] >= NumElements) |
13836 | NumV2OddIndices += i % 2; |
13837 | else if (Mask[i] >= 0) |
13838 | NumV1OddIndices += i % 2; |
13839 | if (NumV2OddIndices < NumV1OddIndices) |
13840 | return true; |
13841 | } |
13842 | } |
13843 | } |
13844 | |
13845 | return false; |
13846 | } |
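      | // For example, canonicalizeShuffleMaskWithCommute returns true for the
      | // 4-element mask <4, 1, 6, 7> (NumV1Elements == 1, NumV2Elements == 3),
      | // and the caller then retries with the commuted mask <0, 5, 2, 3>; the
      | // tie-breakers only apply when the two element counts match.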
13847 | |
13848 | /// \brief Top-level lowering for x86 vector shuffles. |
13849 | /// |
13850 | /// This handles decomposition, canonicalization, and lowering of all x86 |
13851 | /// vector shuffles. Most of the specific lowering strategies are encapsulated |
13852 | /// above in helper routines. The canonicalization attempts to widen shuffles |
13853 | /// to involve fewer lanes of wider elements, consolidate symmetric patterns |
13854 | /// s.t. only one of the two inputs needs to be tested, etc. |
13855 | static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, |
13856 | SelectionDAG &DAG) { |
13857 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); |
13858 | ArrayRef<int> Mask = SVOp->getMask(); |
13859 | SDValue V1 = Op.getOperand(0); |
13860 | SDValue V2 = Op.getOperand(1); |
13861 | MVT VT = Op.getSimpleValueType(); |
13862 | int NumElements = VT.getVectorNumElements(); |
13863 | SDLoc DL(Op); |
13864 | bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); |
13865 | |
13866 | assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13867 | "Can't lower MMX shuffles");
13868 | |
13869 | bool V1IsUndef = V1.isUndef(); |
13870 | bool V2IsUndef = V2.isUndef(); |
13871 | if (V1IsUndef && V2IsUndef) |
13872 | return DAG.getUNDEF(VT); |
13873 | |
13874 | // When we create a shuffle node we put the UNDEF node in the second operand,
13875 | // but in some cases the first operand may be transformed to UNDEF. |
13876 | // In this case we should just commute the node. |
13877 | if (V1IsUndef) |
13878 | return DAG.getCommutedVectorShuffle(*SVOp); |
13879 | |
13880 | // Check for non-undef masks pointing at an undef vector and make the masks |
13881 | // undef as well. This makes it easier to match the shuffle based solely on |
13882 | // the mask. |
13883 | if (V2IsUndef) |
13884 | for (int M : Mask) |
13885 | if (M >= NumElements) { |
13886 | SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); |
13887 | for (int &M : NewMask) |
13888 | if (M >= NumElements) |
13889 | M = -1; |
13890 | return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); |
13891 | } |
13892 | |
13893 | // Check for illegal shuffle mask element index values. |
13894 | int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; |
13895 | assert(llvm::all_of(Mask,
13896 | [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13897 | "Out of bounds shuffle index");
13898 | |
13899 | // We actually see shuffles that are entirely re-arrangements of a set of |
13900 | // zero inputs. This mostly happens while decomposing complex shuffles into |
13901 | // simple ones. Directly lower these as a buildvector of zeros. |
13902 | APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); |
13903 | if (Zeroable.isAllOnesValue()) |
13904 | return getZeroVector(VT, Subtarget, DAG, DL); |
13905 | |
13906 | // Try to collapse shuffles into using a vector type with fewer elements but |
13907 | // wider element types. We cap this to not form integers or floating point |
13908 | // elements wider than 64 bits, but it might be interesting to form i128 |
13909 | // integers to handle flipping the low and high halves of AVX 256-bit vectors. |
13910 | SmallVector<int, 16> WidenedMask; |
13911 | if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && |
13912 | canWidenShuffleElements(Mask, WidenedMask)) { |
13913 | MVT NewEltVT = VT.isFloatingPoint() |
13914 | ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) |
13915 | : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); |
13916 | MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); |
13917 | // Make sure that the new vector type is legal. For example, v2f64 isn't |
13918 | // legal on SSE1. |
13919 | if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { |
13920 | V1 = DAG.getBitcast(NewVT, V1); |
13921 | V2 = DAG.getBitcast(NewVT, V2); |
13922 | return DAG.getBitcast( |
13923 | VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); |
13924 | } |
13925 | } |
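      | // For example, a v4i32 mask of <0, 1, 4, 5> pairs up cleanly and is
      | // re-emitted as the v2i64 shuffle <0, 2> between bitcast operands,
      | // halving the number of elements the lowering has to consider.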
13926 | |
13927 | // Commute the shuffle if it will improve canonicalization. |
13928 | if (canonicalizeShuffleMaskWithCommute(Mask)) |
13929 | return DAG.getCommutedVectorShuffle(*SVOp); |
13930 | |
13931 | // For each vector width, delegate to a specialized lowering routine. |
13932 | if (VT.is128BitVector()) |
13933 | return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, |
13934 | DAG); |
13935 | |
13936 | if (VT.is256BitVector()) |
13937 | return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, |
13938 | DAG); |
13939 | |
13940 | if (VT.is512BitVector()) |
13941 | return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, |
13942 | DAG); |
13943 | |
13944 | if (Is1BitVector) |
13945 | return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); |
13946 | |
13947 | llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 13947); |
13948 | } |
13949 | |
13950 | /// \brief Try to lower a VSELECT instruction to a vector shuffle. |
13951 | static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, |
13952 | const X86Subtarget &Subtarget, |
13953 | SelectionDAG &DAG) { |
13954 | SDValue Cond = Op.getOperand(0); |
13955 | SDValue LHS = Op.getOperand(1); |
13956 | SDValue RHS = Op.getOperand(2); |
13957 | SDLoc dl(Op); |
13958 | MVT VT = Op.getSimpleValueType(); |
13959 | |
13960 | if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) |
13961 | return SDValue(); |
13962 | auto *CondBV = cast<BuildVectorSDNode>(Cond); |
13963 | |
13964 | // Only non-legal VSELECTs reach this lowering; convert those into generic
13965 | // shuffles and reuse the shuffle lowering path for blends.
13966 | SmallVector<int, 32> Mask; |
13967 | for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { |
13968 | SDValue CondElt = CondBV->getOperand(i); |
13969 | Mask.push_back( |
13970 | isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) |
13971 | : -1); |
13972 | } |
13973 | return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); |
13974 | } |
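      | // For example, a v4i32 VSELECT whose condition is <-1, 0, -1, 0> becomes
      | // the shuffle mask <0, 5, 2, 7> above: non-zero condition lanes pick from
      | // LHS (indices 0..3), zero lanes from RHS (indices 4..7), and any
      | // non-constant lane becomes -1 (undef).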
13975 | |
13976 | SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { |
13977 | // A vselect where all conditions and data are constants can be optimized into |
13978 | // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). |
13979 | if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && |
13980 | ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && |
13981 | ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) |
13982 | return SDValue(); |
13983 | |
13984 | // If this VSELECT has a vector of i1 as a mask, it will be directly matched
13985 | // with patterns on the mask registers on AVX-512. |
13986 | if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) |
13987 | return Op; |
13988 | |
13989 | // Try to lower this to a blend-style vector shuffle. This can handle all |
13990 | // constant condition cases. |
13991 | if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) |
13992 | return BlendOp; |
13993 | |
13994 | // Variable blends are only legal from SSE4.1 onward. |
13995 | if (!Subtarget.hasSSE41()) |
13996 | return SDValue(); |
13997 | |
13998 | SDLoc dl(Op); |
13999 | MVT VT = Op.getSimpleValueType(); |
14000 | |
14001 | // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition |
14002 | // into an i1 condition so that we can use the mask-based 512-bit blend |
14003 | // instructions. |
14004 | if (VT.getSizeInBits() == 512) { |
14005 | SDValue Cond = Op.getOperand(0); |
14006 | // The vNi1 condition case should be handled above as it can be trivially |
14007 | // lowered. |
14008 | assert(Cond.getValueType().getScalarSizeInBits() ==
14009 | VT.getScalarSizeInBits() &&
14010 | "Should have a size-matched integer condition!");
14011 | // Build a mask by testing the condition against itself (tests for zero). |
14012 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
14013 | SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); |
14014 | // Now return a new VSELECT using the mask. |
14015 | return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); |
14016 | } |
14017 | |
14018 | // Only some types will be legal on some subtargets. If we can emit a legal |
14019 | // VSELECT-matching blend, return Op, but if we need to expand, return
14020 | // a null value. |
14021 | switch (VT.SimpleTy) { |
14022 | default: |
14023 | // Most of the vector types have blends past SSE4.1. |
14024 | return Op; |
14025 | |
14026 | case MVT::v32i8: |
14027 | // The byte blends for AVX vectors were introduced only in AVX2. |
14028 | if (Subtarget.hasAVX2()) |
14029 | return Op; |
14030 | |
14031 | return SDValue(); |
14032 | |
14033 | case MVT::v8i16: |
14034 | case MVT::v16i16: |
14035 | // AVX-512 BWI and VLX features support VSELECT with i16 elements. |
14036 | if (Subtarget.hasBWI() && Subtarget.hasVLX()) |
14037 | return Op; |
14038 | |
14039 | // FIXME: We should custom lower this by fixing the condition and using i8 |
14040 | // blends. |
14041 | return SDValue(); |
14042 | } |
14043 | } |
14044 | |
14045 | static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { |
14046 | MVT VT = Op.getSimpleValueType(); |
14047 | SDLoc dl(Op); |
14048 | |
14049 | if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) |
14050 | return SDValue(); |
14051 | |
14052 | if (VT.getSizeInBits() == 8) { |
14053 | SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, |
14054 | Op.getOperand(0), Op.getOperand(1)); |
14055 | SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, |
14056 | DAG.getValueType(VT)); |
14057 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); |
14058 | } |
14059 | |
14060 | if (VT == MVT::f32) { |
14061 | // EXTRACTPS outputs to a GPR32 register which will require a movd to copy |
14062 | // the result back to FR32 register. It's only worth matching if the |
14063 | // result has a single use which is a store or a bitcast to i32. And in |
14064 | // the case of a store, it's not worth it if the index is a constant 0, |
14065 | // because a MOVSSmr can be used instead, which is smaller and faster. |
14066 | if (!Op.hasOneUse()) |
14067 | return SDValue(); |
14068 | SDNode *User = *Op.getNode()->use_begin(); |
14069 | if ((User->getOpcode() != ISD::STORE || |
14070 | isNullConstant(Op.getOperand(1))) && |
14071 | (User->getOpcode() != ISD::BITCAST || |
14072 | User->getValueType(0) != MVT::i32)) |
14073 | return SDValue(); |
14074 | SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, |
14075 | DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), |
14076 | Op.getOperand(1)); |
14077 | return DAG.getBitcast(MVT::f32, Extract); |
14078 | } |
14079 | |
14080 | if (VT == MVT::i32 || VT == MVT::i64) { |
14081 | // ExtractPS/pextrq works with constant index. |
14082 | if (isa<ConstantSDNode>(Op.getOperand(1))) |
14083 | return Op; |
14084 | } |
14085 | |
14086 | return SDValue(); |
14087 | } |
14088 | |
14089 | /// Extract one bit from mask vector, like v16i1 or v8i1. |
14090 | /// AVX-512 feature. |
14091 | SDValue |
14092 | X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { |
14093 | SDValue Vec = Op.getOperand(0); |
14094 | SDLoc dl(Vec); |
14095 | MVT VecVT = Vec.getSimpleValueType(); |
14096 | SDValue Idx = Op.getOperand(1); |
14097 | MVT EltVT = Op.getSimpleValueType(); |
14098 | |
14099 | assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14100 | "Unexpected vector type in ExtractBitFromMaskVector");
14101 | |
14102 | // A variable index can't be handled in mask registers,
14103 | // so extend the vector to VR512/VR128.
14104 | if (!isa<ConstantSDNode>(Idx)) { |
14105 | unsigned NumElts = VecVT.getVectorNumElements(); |
14106 | // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
14107 | // than extending to 128/256-bit.
14108 | unsigned VecSize = (NumElts <= 4 ? 128 : 512); |
14109 | MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); |
14110 | SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); |
14111 | SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, |
14112 | ExtVT.getVectorElementType(), Ext, Idx); |
14113 | return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); |
14114 | } |
14115 | |
14116 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
14117 | if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || |
14118 | (VecVT.getVectorNumElements() < 8)) { |
14119 | // Use kshiftlw/rw instruction. |
14120 | VecVT = MVT::v16i1; |
14121 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, |
14122 | DAG.getUNDEF(VecVT), |
14123 | Vec, |
14124 | DAG.getIntPtrConstant(0, dl)); |
14125 | } |
14126 | unsigned MaxShift = VecVT.getVectorNumElements() - 1;
14127 | if (MaxShift - IdxVal)
14128 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14129 | DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
14130 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14131 | DAG.getConstant(MaxShift, dl, MVT::i8));
14132 | return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec, |
14133 | DAG.getIntPtrConstant(0, dl)); |
14134 | } |
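      | // For example, extracting bit 3 of a v16i1 mask in the routine above
      | // shifts left by MaxShift - IdxVal == 12 to drop the higher bits, then
      | // right by MaxShift == 15, leaving the requested bit in element 0 for
      | // VEXTRACT.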
14135 | |
14136 | SDValue |
14137 | X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, |
14138 | SelectionDAG &DAG) const { |
14139 | SDLoc dl(Op); |
14140 | SDValue Vec = Op.getOperand(0); |
14141 | MVT VecVT = Vec.getSimpleValueType(); |
14142 | SDValue Idx = Op.getOperand(1); |
14143 | |
14144 | if (VecVT.getVectorElementType() == MVT::i1) |
14145 | return ExtractBitFromMaskVector(Op, DAG); |
14146 | |
14147 | if (!isa<ConstantSDNode>(Idx)) { |
14148 | // It's more profitable to go through memory (1 cycle throughput)
14149 | // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14150 | // The IACA tool was used to get the performance estimate
14151 | // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) |
14152 | // |
14153 | // example : extractelement <16 x i8> %a, i32 %i |
14154 | // |
14155 | // Block Throughput: 3.00 Cycles |
14156 | // Throughput Bottleneck: Port5 |
14157 | // |
14158 | // | Num Of | Ports pressure in cycles | | |
14159 | // | Uops | 0 - DV | 5 | 6 | 7 | | |
14160 | // --------------------------------------------- |
14161 | // | 1 | | 1.0 | | | CP | vmovd xmm1, edi |
14162 | // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 |
14163 | // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 |
14164 | // Total Num Of Uops: 4 |
14165 | // |
14166 | // |
14167 | // Block Throughput: 1.00 Cycles |
14168 | // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 |
14169 | // |
14170 | // | | Ports pressure in cycles | | |
14171 | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | |
14172 | // --------------------------------------------------------- |
14173 | // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 |
14174 | // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] |
14175 | // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] |
14176 | // Total Num Of Uops: 4 |
14177 | |
14178 | return SDValue(); |
14179 | } |
14180 | |
14181 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
14182 | |
14183 | // If this is a 256-bit vector result, first extract the 128-bit vector and |
14184 | // then extract the element from the 128-bit vector. |
14185 | if (VecVT.is256BitVector() || VecVT.is512BitVector()) { |
14186 | // Get the 128-bit vector. |
14187 | Vec = extract128BitVector(Vec, IdxVal, DAG, dl); |
14188 | MVT EltVT = VecVT.getVectorElementType(); |
14189 | |
14190 | unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); |
14191 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14192 | |
14193 | // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 |
14194 | // this can be done with a mask. |
14195 | IdxVal &= ElemsPerChunk - 1; |
14196 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, |
14197 | DAG.getConstant(IdxVal, dl, MVT::i32)); |
14198 | } |
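      | // For example, extracting element 6 of a v8i32 pulls out 128-bit chunk 1
      | // (elements 4-7) and then extracts index 6 & (4 - 1) == 2 from it; the
      | // mask stands in for a modulo because ElemsPerChunk is a power of two.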
14199 | |
14200 | assert(VecVT.is128BitVector() && "Unexpected vector length");
14201 | |
14202 | MVT VT = Op.getSimpleValueType(); |
14203 | |
14204 | if (VT.getSizeInBits() == 16) { |
14205 | // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless |
14206 | // we're going to zero extend the register or fold the store (SSE41 only). |
14207 | if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && |
14208 | !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) |
14209 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, |
14210 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, |
14211 | DAG.getBitcast(MVT::v4i32, Vec), Idx)); |
14212 | |
14213 | // Transform it so it matches pextrw, which produces a 32-bit result.
14214 | SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, |
14215 | Op.getOperand(0), Op.getOperand(1)); |
14216 | SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, |
14217 | DAG.getValueType(VT)); |
14218 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); |
14219 | } |
14220 | |
14221 | if (Subtarget.hasSSE41()) |
14222 | if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) |
14223 | return Res; |
14224 | |
14225 | // TODO: We only extract a single element from v16i8, we can probably afford |
14226 | // to be more aggressive here before using the default approach of spilling to |
14227 | // stack. |
14228 | if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { |
14229 | // Extract either the lowest i32 or any i16, and extract the sub-byte. |
14230 | int DWordIdx = IdxVal / 4; |
14231 | if (DWordIdx == 0) { |
14232 | SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, |
14233 | DAG.getBitcast(MVT::v4i32, Vec), |
14234 | DAG.getIntPtrConstant(DWordIdx, dl)); |
14235 | int ShiftVal = (IdxVal % 4) * 8; |
14236 | if (ShiftVal != 0) |
14237 | Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, |
14238 | DAG.getConstant(ShiftVal, dl, MVT::i32)); |
14239 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); |
14240 | } |
14241 | |
14242 | int WordIdx = IdxVal / 2; |
14243 | SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, |
14244 | DAG.getBitcast(MVT::v8i16, Vec), |
14245 | DAG.getIntPtrConstant(WordIdx, dl)); |
14246 | int ShiftVal = (IdxVal % 2) * 8; |
14247 | if (ShiftVal != 0) |
14248 | Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, |
14249 | DAG.getConstant(ShiftVal, dl, MVT::i16)); |
14250 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); |
14251 | } |
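      | // For example, extracting byte 5 of a v16i8 takes the WordIdx branch
      | // (DWordIdx == 1): it extracts i16 word 2 of the v8i16 bitcast, shifts
      | // right by (5 % 2) * 8 == 8 bits, and truncates the high byte to i8.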
14252 | |
14253 | if (VT.getSizeInBits() == 32) { |
14254 | if (IdxVal == 0) |
14255 | return Op; |
14256 | |
14257 | // SHUFPS the element to the lowest double word, then movss. |
14258 | int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 }; |
14259 | Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); |
14260 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, |
14261 | DAG.getIntPtrConstant(0, dl)); |
14262 | } |
14263 | |
14264 | if (VT.getSizeInBits() == 64) { |
14265 | // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b |
14266 | // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught |
14267 | // to match extract_elt for f64. |
14268 | if (IdxVal == 0) |
14269 | return Op; |
14270 | |
14271 | // UNPCKHPD the element to the lowest double word, then movsd. |
14272 | // Note if the lower 64 bits of the result of the UNPCKHPD is then stored |
14273 | // to a f64mem, the whole operation is folded into a single MOVHPDmr. |
14274 | int Mask[2] = { 1, -1 }; |
14275 | Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); |
14276 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, |
14277 | DAG.getIntPtrConstant(0, dl)); |
14278 | } |
14279 | |
14280 | return SDValue(); |
14281 | } |
14282 | |
14283 | /// Insert one bit to mask vector, like v16i1 or v8i1. |
14284 | /// AVX-512 feature. |
14285 | SDValue |
14286 | X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { |
14287 | SDLoc dl(Op); |
14288 | SDValue Vec = Op.getOperand(0); |
14289 | SDValue Elt = Op.getOperand(1); |
14290 | SDValue Idx = Op.getOperand(2); |
14291 | MVT VecVT = Vec.getSimpleValueType(); |
14292 | |
14293 | if (!isa<ConstantSDNode>(Idx)) { |
14294 | // Non-constant index. Extend source and destination,
14295 | // insert the element, and then truncate the result.
14296 | MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); |
14297 | MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); |
14298 | SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, |
14299 | DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), |
14300 | DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); |
14301 | return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); |
14302 | } |
14303 | |
14304 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
14305 | SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); |
14306 | unsigned NumElems = VecVT.getVectorNumElements(); |
14307 | |
14308 | if (Vec.isUndef()) {
14309 | if (IdxVal) |
14310 | EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, |
14311 | DAG.getConstant(IdxVal, dl, MVT::i8)); |
14312 | return EltInVec; |
14313 | } |
14314 | |
14315 | // Insertion of one bit into the first position.
14316 | if (IdxVal == 0) {
14317 | // Clean top bits of vector. |
14318 | EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, |
14319 | DAG.getConstant(NumElems - 1, dl, MVT::i8)); |
14320 | EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, |
14321 | DAG.getConstant(NumElems - 1, dl, MVT::i8)); |
14322 | // Clean the first bit in source vector. |
14323 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, |
14324 | DAG.getConstant(1, dl, MVT::i8));
14325 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, |
14326 | DAG.getConstant(1, dl, MVT::i8)); |
14327 | |
14328 | return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); |
14329 | } |
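      | // For example, on v8i1 the two KSHIFT pairs above shift EltInVec left and
      | // then right by NumElems - 1 == 7 so that only bit 0 survives, shift Vec
      | // right and then left by 1 to clear its bit 0, and OR the results.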
14330 | // Insertion of one bit into the last position.
14331 | if (IdxVal == NumElems - 1) {
14332 | // Move the bit to the last position inside the vector. |
14333 | EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, |
14334 | DAG.getConstant(IdxVal, dl, MVT::i8)); |
14335 | // Clean the last bit in the source vector. |
14336 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, |
14337 | DAG.getConstant(1, dl, MVT::i8)); |
14338 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, |
14339 | DAG.getConstant(1, dl, MVT::i8));
14340 | |
14341 | return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); |
14342 | } |
14343 | |
14344 | // Use shuffle to insert element. |
14345 | SmallVector<int, 64> MaskVec(NumElems); |
14346 | for (unsigned i = 0; i != NumElems; ++i) |
14347 | MaskVec[i] = (i == IdxVal) ? NumElems : i; |
14348 | |
14349 | return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec); |
14350 | } |
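      | // In the shuffle fallback above, inserting into element 3 of a v8i1
      | // builds the mask <0, 1, 2, 8, 4, 5, 6, 7>, where index 8 (NumElems)
      | // selects element 0 of EltInVec, i.e. the freshly inserted bit.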
14351 | |
14352 | SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, |
14353 | SelectionDAG &DAG) const { |
14354 | MVT VT = Op.getSimpleValueType(); |
14355 | MVT EltVT = VT.getVectorElementType(); |
14356 | unsigned NumElts = VT.getVectorNumElements(); |
14357 | |
14358 | if (EltVT == MVT::i1) |
14359 | return InsertBitToMaskVector(Op, DAG); |
14360 | |
14361 | SDLoc dl(Op); |
14362 | SDValue N0 = Op.getOperand(0); |
14363 | SDValue N1 = Op.getOperand(1); |
14364 | SDValue N2 = Op.getOperand(2); |
14365 | if (!isa<ConstantSDNode>(N2)) |
14366 | return SDValue(); |
14367 | auto *N2C = cast<ConstantSDNode>(N2); |
14368 | unsigned IdxVal = N2C->getZExtValue(); |
14369 | |
14370 | bool IsZeroElt = X86::isZeroNode(N1); |
14371 | bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); |
14372 | |
14373 | // If we are inserting an element, see if we can do this more efficiently
14374 | // with a blend shuffle against a rematerializable vector than with a costly
14375 | // integer insertion.
14376 | if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && |
14377 | 16 <= EltVT.getSizeInBits()) { |
14378 | SmallVector<int, 8> BlendMask; |
14379 | for (unsigned i = 0; i != NumElts; ++i) |
14380 | BlendMask.push_back(i == IdxVal ? i + NumElts : i); |
14381 | SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) |
14382 | : DAG.getConstant(-1, dl, VT); |
14383 | return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); |
14384 | } |
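      | // For example, inserting a zero into element 3 of a v8i16 builds the
      | // blend mask <0, 1, 2, 11, 4, 5, 6, 7> against a rematerializable
      | // all-zeros vector, which shuffle lowering can emit as a single blend.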
14385 | |
14386 | // If the vector is wider than 128 bits, extract the 128-bit subvector, insert |
14387 | // into that, and then insert the subvector back into the result. |
14388 | if (VT.is256BitVector() || VT.is512BitVector()) { |
14389 | // With a 256-bit vector, we can insert into the zero element efficiently |
14390 | // using a blend if we have AVX or AVX2 and the right data type. |
14391 | if (VT.is256BitVector() && IdxVal == 0) { |
14392 | // TODO: It is worthwhile to cast integer to floating point and back |
14393 | // and incur a domain crossing penalty if that's what we'll end up |
14394 | // doing anyway after extracting to a 128-bit vector. |
14395 | if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || |
14396 | (Subtarget.hasAVX2() && EltVT == MVT::i32)) { |
14397 | SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); |
14398 | N2 = DAG.getIntPtrConstant(1, dl); |
14399 | return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); |
14400 | } |
14401 | } |
14402 | |
14403 | // Get the desired 128-bit vector chunk. |
14404 | SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); |
14405 | |
14406 | // Insert the element into the desired chunk. |
14407 | unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); |
14408 | assert(isPowerOf2_32(NumEltsIn128));
14409 | // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. |
14410 | unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); |
14411 | |
14412 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, |
14413 | DAG.getConstant(IdxIn128, dl, MVT::i32)); |
14414 | |
14415 | // Insert the changed part back into the bigger vector |
14416 | return insert128BitVector(N0, V, IdxVal, DAG, dl); |
14417 | } |
14418 | assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14419 | |
14420 | // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
14421 | // argument. SSE41 required for pinsrb. |
14422 | if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { |
14423 | unsigned Opc; |
14424 | if (VT == MVT::v8i16) { |
14425 | assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14426 | Opc = X86ISD::PINSRW; |
14427 | } else { |
14428 | assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14429 | assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14430 | Opc = X86ISD::PINSRB; |
14431 | } |
14432 | |
14433 | if (N1.getValueType() != MVT::i32) |
14434 | N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); |
14435 | if (N2.getValueType() != MVT::i32) |
14436 | N2 = DAG.getIntPtrConstant(IdxVal, dl); |
14437 | return DAG.getNode(Opc, dl, VT, N0, N1, N2); |
14438 | } |
14439 | |
14440 | if (Subtarget.hasSSE41()) { |
14441 | if (EltVT == MVT::f32) { |
14442 | // Bits [7:6] of the constant are the source select. This will always be |
14443 | // zero here. The DAG Combiner may combine an extract_elt index into |
14444 | // these bits. For example (insert (extract, 3), 2) could be matched by |
14445 | // putting the '3' into bits [7:6] of X86ISD::INSERTPS. |
14446 | // Bits [5:4] of the constant are the destination select. This is the |
14447 | // value of the incoming immediate. |
14448 | // Bits [3:0] of the constant are the zero mask. The DAG Combiner may |
14449 | // combine either bitwise AND or insert of float 0.0 to set these bits. |
14450 | |
14451 | bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); |
14452 | if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { |
14453 | // If this is an insertion of 32-bits into the low 32-bits of |
14454 | // a vector, we prefer to generate a blend with immediate rather |
14455 | // than an insertps. Blends are simpler operations in hardware and so |
14456 | // will always have equal or better performance than insertps. |
14457 | // But if optimizing for size and there's a load folding opportunity, |
14458 | // generate insertps because blendps does not have a 32-bit memory |
14459 | // operand form. |
14460 | N2 = DAG.getIntPtrConstant(1, dl); |
14461 | N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); |
14462 | return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); |
14463 | } |
14464 | N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); |
14465 | // Create this as a scalar to vector.
14466 | N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); |
14467 | return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); |
14468 | } |
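      | // For example, an insertion into element 2 uses the immediate
      | // IdxVal << 4 == 0x20: destination-select bits [5:4] = 2, with a zero
      | // source select and an empty zero mask, per the bit layout above.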
14469 | |
14470 | // PINSR* works with constant index. |
14471 | if (EltVT == MVT::i32 || EltVT == MVT::i64) |
14472 | return Op; |
14473 | } |
14474 | |
14475 | return SDValue(); |
14476 | } |
14477 | |
14478 | static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, |
14479 | SelectionDAG &DAG) { |
14480 | SDLoc dl(Op); |
14481 | MVT OpVT = Op.getSimpleValueType(); |
14482 | |
14483 | // It's always cheaper to replace a xor+movd with xorps, and it simplifies
14484 | // further combines.
14485 | if (X86::isZeroNode(Op.getOperand(0))) |
14486 | return getZeroVector(OpVT, Subtarget, DAG, dl); |
14487 | |
14488 | // If this is a 256-bit vector result, first insert into a 128-bit |
14489 | // vector and then insert into the 256-bit vector. |
14490 | if (!OpVT.is128BitVector()) { |
14491 | // Insert into a 128-bit vector. |
14492 | unsigned SizeFactor = OpVT.getSizeInBits() / 128; |
14493 | MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), |
14494 | OpVT.getVectorNumElements() / SizeFactor); |
14495 | |
14496 | Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); |
14497 | |
14498 | // Insert the 128-bit vector. |
14499 | return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); |
14500 | } |
14501 | assert(OpVT.is128BitVector() && "Expected an SSE type!");
14502 | |
14503 | // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. |
14504 | if (OpVT == MVT::v4i32) |
14505 | return Op; |
14506 | |
14507 | SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); |
14508 | return DAG.getBitcast( |
14509 | OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); |
14510 | } |
14511 | |
14512 | // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in |
14513 | // a simple subregister reference or explicit instructions to grab |
14514 | // upper bits of a vector. |
14515 | static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, |
14516 | SelectionDAG &DAG) { |
14517 | assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14518 | |
14519 | SDLoc dl(Op); |
14520 | SDValue In = Op.getOperand(0); |
14521 | SDValue Idx = Op.getOperand(1); |
14522 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
14523 | MVT ResVT = Op.getSimpleValueType(); |
14524 | |
14525 | assert((In.getSimpleValueType().is256BitVector() ||
14526 | In.getSimpleValueType().is512BitVector()) &&
14527 | "Can only extract from 256-bit or 512-bit vectors");
14528 | |
14529 | // If the input is a buildvector just emit a smaller one. |
14530 | unsigned ElemsPerChunk = ResVT.getVectorNumElements(); |
14531 | if (In.getOpcode() == ISD::BUILD_VECTOR) |
14532 | return DAG.getBuildVector( |
14533 | ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); |
14534 | |
14535 | // Everything else is legal. |
14536 | return Op; |
14537 | } |
14538 | |
14539 | // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a |
14540 | // simple superregister reference or explicit instructions to insert |
14541 | // the upper bits of a vector. |
14542 | static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, |
14543 | SelectionDAG &DAG) { |
14544 | assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14545 | |
14546 | return insert1BitVector(Op, DAG, Subtarget); |
14547 | } |
14548 | |
14549 | // Returns the appropriate wrapper opcode for a global reference. |
14550 | unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { |
14551 | // References to absolute symbols are never PC-relative. |
14552 | if (GV && GV->isAbsoluteSymbolRef()) |
14553 | return X86ISD::Wrapper; |
14554 | |
14555 | CodeModel::Model M = getTargetMachine().getCodeModel(); |
14556 | if (Subtarget.isPICStyleRIPRel() && |
14557 | (M == CodeModel::Small || M == CodeModel::Kernel)) |
14558 | return X86ISD::WrapperRIP; |
14559 | |
14560 | return X86ISD::Wrapper; |
14561 | } |
14562 | |
14563 | // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as |
14564 | // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is |
14565 | // one of the above mentioned nodes. It has to be wrapped because otherwise |
14566 | // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only |
14567 | // be used to form addressing mode. These wrapped nodes will be selected |
14568 | // into MOV32ri. |
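| // For example, a constant-pool reference is selected roughly as
| // (X86ISD::Wrapper (TargetConstantPool ...)), and in 32-bit PIC mode as
| // (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper (TargetConstantPool ...))).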
14569 | SDValue |
14570 | X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { |
14571 | ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); |
14572 | |
14573 | // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the |
14574 | // global base reg. |
14575 | unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); |
14576 | |
14577 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14578 | SDValue Result = DAG.getTargetConstantPool( |
14579 | CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); |
14580 | SDLoc DL(CP); |
14581 | Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); |
14582 | // With PIC, the address is actually $g + Offset. |
14583 | if (OpFlag) { |
14584 | Result = |
14585 | DAG.getNode(ISD::ADD, DL, PtrVT, |
14586 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); |
14587 | } |
14588 | |
14589 | return Result; |
14590 | } |
14591 | |
14592 | SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { |
14593 | JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); |
14594 | |
14595 | // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the |
14596 | // global base reg. |
14597 | unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); |
14598 | |
14599 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14600 | SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); |
14601 | SDLoc DL(JT); |
14602 | Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); |
14603 | |
14604 | // With PIC, the address is actually $g + Offset. |
14605 | if (OpFlag) |
14606 | Result = |
14607 | DAG.getNode(ISD::ADD, DL, PtrVT, |
14608 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); |
14609 | |
14610 | return Result; |
14611 | } |
14612 | |
14613 | SDValue |
14614 | X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { |
14615 | const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); |
14616 | |
14617 | // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the |
14618 | // global base reg. |
14619 | const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); |
14620 | unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); |
14621 | |
14622 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14623 | SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); |
14624 | |
14625 | SDLoc DL(Op); |
14626 | Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); |
14627 | |
14628 | // With PIC, the address is actually $g + Offset. |
14629 | if (isPositionIndependent() && !Subtarget.is64Bit()) { |
14630 | Result = |
14631 | DAG.getNode(ISD::ADD, DL, PtrVT, |
14632 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); |
14633 | } |
14634 | |
14635 | // For symbols that require a load from a stub to get the address, emit the |
14636 | // load. |
14637 | if (isGlobalStubReference(OpFlag)) |
14638 | Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, |
14639 | MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
14640 | |
14641 | return Result; |
14642 | } |
14643 | |
14644 | SDValue |
14645 | X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { |
14646 | // Create the TargetBlockAddressAddress node. |
14647 | unsigned char OpFlags = |
14648 | Subtarget.classifyBlockAddressReference(); |
14649 | const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); |
14650 | int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); |
14651 | SDLoc dl(Op); |
14652 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14653 | SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); |
14654 | Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); |
14655 | |
14656 | // With PIC, the address is actually $g + Offset. |
14657 | if (isGlobalRelativeToPICBase(OpFlags)) { |
14658 | Result = DAG.getNode(ISD::ADD, dl, PtrVT, |
14659 | DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); |
14660 | } |
14661 | |
14662 | return Result; |
14663 | } |
14664 | |
14665 | SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, |
14666 | const SDLoc &dl, int64_t Offset, |
14667 | SelectionDAG &DAG) const { |
14668 | // Create the TargetGlobalAddress node, folding in the constant |
14669 | // offset if it is legal. |
14670 | unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); |
14671 | CodeModel::Model M = DAG.getTarget().getCodeModel(); |
14672 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14673 | SDValue Result; |
14674 | if (OpFlags == X86II::MO_NO_FLAG && |
14675 | X86::isOffsetSuitableForCodeModel(Offset, M)) { |
14676 | // A direct static reference to a global. |
14677 | Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); |
14678 | Offset = 0; |
14679 | } else { |
14680 | Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); |
14681 | } |
14682 | |
14683 | Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result); |
14684 | |
14685 | // With PIC, the address is actually $g + Offset. |
14686 | if (isGlobalRelativeToPICBase(OpFlags)) { |
14687 | Result = DAG.getNode(ISD::ADD, dl, PtrVT, |
14688 | DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); |
14689 | } |
14690 | |
14691 | // For globals that require a load from a stub to get the address, emit the |
14692 | // load. |
14693 | if (isGlobalStubReference(OpFlags)) |
14694 | Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, |
14695 | MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
14696 | |
14697 | // If there was a non-zero offset that we didn't fold, create an explicit |
14698 | // addition for it. |
14699 | if (Offset != 0) |
14700 | Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, |
14701 | DAG.getConstant(Offset, dl, PtrVT)); |
14702 | |
14703 | return Result; |
14704 | } |
14705 | |
14706 | SDValue |
14707 | X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { |
14708 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); |
14709 | int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); |
14710 | return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); |
14711 | } |
14712 | |
14713 | static SDValue |
14714 | GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, |
14715 | SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, |
14716 | unsigned char OperandFlags, bool LocalDynamic = false) { |
14717 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
14718 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
14719 | SDLoc dl(GA); |
14720 | SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, |
14721 | GA->getValueType(0), |
14722 | GA->getOffset(), |
14723 | OperandFlags); |
14724 | |
14725 | X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR |
14726 | : X86ISD::TLSADDR; |
14727 | |
14728 | if (InFlag) { |
14729 | SDValue Ops[] = { Chain, TGA, *InFlag }; |
14730 | Chain = DAG.getNode(CallType, dl, NodeTys, Ops); |
14731 | } else { |
14732 | SDValue Ops[] = { Chain, TGA }; |
14733 | Chain = DAG.getNode(CallType, dl, NodeTys, Ops); |
14734 | } |
14735 | |
14736 | // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. |
14737 | MFI.setAdjustsStack(true); |
14738 | MFI.setHasCalls(true); |
14739 | |
14740 | SDValue Flag = Chain.getValue(1); |
14741 | return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); |
14742 | } |
14743 | |
14744 | // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit |
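| // On ELF/i386 this typically materializes as the canonical sequence
| //   leal x@tlsgd(,%ebx,1), %eax
| //   call ___tls_get_addr@PLT
| // with the result returned in %eax (hence X86::EAX below).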
14745 | static SDValue |
14746 | LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, |
14747 | const EVT PtrVT) { |
14748 | SDValue InFlag; |
14749 | SDLoc dl(GA); // TODO: the function entry point might be a better location.
14750 | SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, |
14751 | DAG.getNode(X86ISD::GlobalBaseReg, |
14752 | SDLoc(), PtrVT), InFlag); |
14753 | InFlag = Chain.getValue(1); |
14754 | |
14755 | return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); |
14756 | } |
14757 | |
14758 | // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit |
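| // On ELF/x86-64 this typically becomes
| //   leaq x@tlsgd(%rip), %rdi
| //   callq __tls_get_addr@PLT
| // with the result returned in %rax (hence X86::RAX below).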
14759 | static SDValue |
14760 | LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, |
14761 | const EVT PtrVT) { |
14762 | return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, |
14763 | X86::RAX, X86II::MO_TLSGD); |
14764 | } |
14765 | |
14766 | static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, |
14767 | SelectionDAG &DAG, |
14768 | const EVT PtrVT, |
14769 | bool is64Bit) { |
14770 | SDLoc dl(GA); |
14771 | |
14772 | // Get the start address of the TLS block for this module. |
14773 | X86MachineFunctionInfo *MFI = DAG.getMachineFunction() |
14774 | .getInfo<X86MachineFunctionInfo>(); |
14775 | MFI->incNumLocalDynamicTLSAccesses(); |
14776 | |
14777 | SDValue Base; |
14778 | if (is64Bit) { |
14779 | Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, |
14780 | X86II::MO_TLSLD, /*LocalDynamic=*/true); |
14781 | } else { |
14782 | SDValue InFlag; |
14783 | SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, |
14784 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); |
14785 | InFlag = Chain.getValue(1); |
14786 | Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, |
14787 | X86II::MO_TLSLDM, /*LocalDynamic=*/true); |
14788 | } |
14789 | |
14790 | // Note: the CleanupLocalDynamicTLSPass will remove redundant computations |
14791 | // of Base. |
14792 | |
14793 | // Build x@dtpoff. |
14794 | unsigned char OperandFlags = X86II::MO_DTPOFF; |
14795 | unsigned WrapperKind = X86ISD::Wrapper; |
14796 | SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, |
14797 | GA->getValueType(0), |
14798 | GA->getOffset(), OperandFlags); |
14799 | SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); |
14800 | |
14801 | // Add x@dtpoff with the base. |
14802 | return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); |
14803 | } |
14804 | |
14805 | // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. |
14806 | static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, |
14807 | const EVT PtrVT, TLSModel::Model model, |
14808 | bool is64Bit, bool isPIC) { |
14809 | SDLoc dl(GA); |
14810 | |
14811 | // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). |
14812 | Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), |
14813 | is64Bit ? 257 : 256)); |
14814 | |
14815 | SDValue ThreadPointer = |
14816 | DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), |
14817 | MachinePointerInfo(Ptr)); |
14818 | |
14819 | unsigned char OperandFlags = 0; |
14820 | // Most TLS accesses are not RIP relative, even on x86-64. One exception is |
14821 | // the initial-exec model.
14822 | unsigned WrapperKind = X86ISD::Wrapper; |
14823 | if (model == TLSModel::LocalExec) { |
14824 | OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; |
14825 | } else if (model == TLSModel::InitialExec) { |
14826 | if (is64Bit) { |
14827 | OperandFlags = X86II::MO_GOTTPOFF; |
14828 | WrapperKind = X86ISD::WrapperRIP; |
14829 | } else { |
14830 | OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; |
14831 | } |
14832 | } else { |
14833 | llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 14833); |
14834 | } |
14835 | |
14836 | // emit "addl x@ntpoff,%eax" (local exec) |
14837 | // or "addl x@indntpoff,%eax" (initial exec) |
14838 | // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) |
14839 | SDValue TGA = |
14840 | DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), |
14841 | GA->getOffset(), OperandFlags); |
14842 | SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); |
14843 | |
14844 | if (model == TLSModel::InitialExec) { |
14845 | if (isPIC && !is64Bit) { |
14846 | Offset = DAG.getNode(ISD::ADD, dl, PtrVT, |
14847 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), |
14848 | Offset); |
14849 | } |
14850 | |
14851 | Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, |
14852 | MachinePointerInfo::getGOT(DAG.getMachineFunction())); |
14853 | } |
14854 | |
14855 | // The address of the thread local variable is the add of the thread |
14856 | // pointer with the offset of the variable. |
14857 | return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); |
14858 | } |
14859 | |
14860 | SDValue |
14861 | X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { |
14862 | |
14863 | GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); |
14864 | |
14865 | if (DAG.getTarget().Options.EmulatedTLS) |
14866 | return LowerToTLSEmulatedModel(GA, DAG); |
14867 | |
14868 | const GlobalValue *GV = GA->getGlobal(); |
14869 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
14870 | bool PositionIndependent = isPositionIndependent(); |
14871 | |
14872 | if (Subtarget.isTargetELF()) { |
14873 | TLSModel::Model model = DAG.getTarget().getTLSModel(GV); |
14874 | switch (model) { |
14875 | case TLSModel::GeneralDynamic: |
14876 | if (Subtarget.is64Bit()) |
14877 | return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); |
14878 | return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); |
14879 | case TLSModel::LocalDynamic: |
14880 | return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, |
14881 | Subtarget.is64Bit()); |
14882 | case TLSModel::InitialExec: |
14883 | case TLSModel::LocalExec: |
14884 | return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), |
14885 | PositionIndependent); |
14886 | } |
14887 | llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 14887); |
14888 | } |
14889 | |
14890 | if (Subtarget.isTargetDarwin()) { |
14891 | // Darwin only has one model of TLS. Lower to that. |
14892 | unsigned char OpFlag = 0; |
14893 | unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? |
14894 | X86ISD::WrapperRIP : X86ISD::Wrapper; |
14895 | |
14896 | // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the |
14897 | // global base reg. |
14898 | bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); |
14899 | if (PIC32) |
14900 | OpFlag = X86II::MO_TLVP_PIC_BASE; |
14901 | else |
14902 | OpFlag = X86II::MO_TLVP; |
14903 | SDLoc DL(Op); |
14904 | SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, |
14905 | GA->getValueType(0), |
14906 | GA->getOffset(), OpFlag); |
14907 | SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); |
14908 | |
14909 | // With PIC32, the address is actually $g + Offset. |
14910 | if (PIC32) |
14911 | Offset = DAG.getNode(ISD::ADD, DL, PtrVT, |
14912 | DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), |
14913 | Offset); |
14914 | |
14915 | // Lowering the machine ISD node will make sure everything is in the right
14916 | // location.
14917 | SDValue Chain = DAG.getEntryNode(); |
14918 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
14919 | Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); |
14920 | SDValue Args[] = { Chain, Offset }; |
14921 | Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); |
14922 | Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), |
14923 | DAG.getIntPtrConstant(0, DL, true), |
14924 | Chain.getValue(1), DL); |
14925 | |
14926 | // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. |
14927 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
14928 | MFI.setAdjustsStack(true); |
14929 | |
14930 | // And our return value (tls address) is in the standard call return value |
14931 | // location. |
14932 | unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; |
14933 | return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); |
14934 | } |
14935 | |
14936 | if (Subtarget.isTargetKnownWindowsMSVC() || |
14937 | Subtarget.isTargetWindowsItanium() || |
14938 | Subtarget.isTargetWindowsGNU()) { |
14939 | // Just use the implicit TLS architecture |
14940 | // Need to generate something similar to: |
14941 | // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage |
14942 | // ; from TEB |
14943 | // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
14944 | // mov rcx, qword [rdx+rcx*8] |
14945 | // mov eax, .tls$:tlsvar |
14946 | // [rax+rcx] contains the address |
14947 | // Windows 64bit: gs:0x58 |
14948 | // Windows 32bit: fs:__tls_array |
14949 | |
14950 | SDLoc dl(GA); |
14951 | SDValue Chain = DAG.getEntryNode(); |
14952 | |
14953 | // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or |
14954 | // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly |
14955 | // use its literal value of 0x2C. |
14956 | Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() |
14957 | ? Type::getInt8PtrTy(*DAG.getContext(), |
14958 | 256) |
14959 | : Type::getInt32PtrTy(*DAG.getContext(), |
14960 | 257)); |
14961 | |
14962 | SDValue TlsArray = Subtarget.is64Bit() |
14963 | ? DAG.getIntPtrConstant(0x58, dl) |
14964 | : (Subtarget.isTargetWindowsGNU() |
14965 | ? DAG.getIntPtrConstant(0x2C, dl) |
14966 | : DAG.getExternalSymbol("_tls_array", PtrVT)); |
14967 | |
14968 | SDValue ThreadPointer = |
14969 | DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); |
14970 | |
14971 | SDValue res; |
14972 | if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { |
14973 | res = ThreadPointer; |
14974 | } else { |
14975 | // Load the _tls_index variable |
14976 | SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); |
14977 | if (Subtarget.is64Bit()) |
14978 | IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, |
14979 | MachinePointerInfo(), MVT::i32); |
14980 | else |
14981 | IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); |
14982 | |
14983 | auto &DL = DAG.getDataLayout(); |
14984 | SDValue Scale = |
14985 | DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); |
14986 | IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); |
14987 | |
14988 | res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); |
14989 | } |
14990 | |
14991 | res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); |
14992 | |
14993 | // Get the offset of start of .tls section |
14994 | SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, |
14995 | GA->getValueType(0), |
14996 | GA->getOffset(), X86II::MO_SECREL); |
14997 | SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); |
14998 | |
14999 | // The address of the thread local variable is the add of the thread |
15000 | // pointer with the offset of the variable. |
15001 | return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); |
15002 | } |
15003 | |
15004 | llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target." , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 15004); |
15005 | } |
15006 | |
15007 | /// Lower SRA_PARTS and friends, which return two i32 values |
15008 | /// and take a 2 x i32 value to shift plus a shift amount. |
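| /// For example, SHL_PARTS with 32-bit parts and ShAmt == 10 produces
| /// Hi = shld(Hi, Lo, 10) == (Hi << 10) | (Lo >> 22) and Lo = Lo << 10.
| /// For ShAmt >= 32 the shld/shrd results are unusable, so the CMOVs below
| /// select Hi = Lo << (ShAmt & 31) and Lo = 0 instead.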
15009 | static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { |
15010 | assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15011 | MVT VT = Op.getSimpleValueType(); |
15012 | unsigned VTBits = VT.getSizeInBits(); |
15013 | SDLoc dl(Op); |
15014 | bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; |
15015 | SDValue ShOpLo = Op.getOperand(0); |
15016 | SDValue ShOpHi = Op.getOperand(1); |
15017 | SDValue ShAmt = Op.getOperand(2); |
15018 | // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the |
15019 | // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
15020 | // during isel. |
15021 | SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, |
15022 | DAG.getConstant(VTBits - 1, dl, MVT::i8)); |
15023 | SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, |
15024 | DAG.getConstant(VTBits - 1, dl, MVT::i8)) |
15025 | : DAG.getConstant(0, dl, VT); |
15026 | |
15027 | SDValue Tmp2, Tmp3; |
15028 | if (Op.getOpcode() == ISD::SHL_PARTS) { |
15029 | Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); |
15030 | Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); |
15031 | } else { |
15032 | Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); |
15033 | Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); |
15034 | } |
15035 | |
15036 | // If the shift amount is larger than or equal to the width of a part, we can't
15037 | // rely on the results of shld/shrd. Insert a test and select the appropriate |
15038 | // values for large shift amounts. |
15039 | SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, |
15040 | DAG.getConstant(VTBits, dl, MVT::i8)); |
15041 | SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, |
15042 | AndNode, DAG.getConstant(0, dl, MVT::i8)); |
15043 | |
15044 | SDValue Hi, Lo; |
15045 | SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); |
15046 | SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; |
15047 | SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; |
15048 | |
15049 | if (Op.getOpcode() == ISD::SHL_PARTS) { |
15050 | Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); |
15051 | Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); |
15052 | } else { |
15053 | Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); |
15054 | Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); |
15055 | } |
15056 | |
15057 | SDValue Ops[2] = { Lo, Hi }; |
15058 | return DAG.getMergeValues(Ops, dl); |
15059 | } |
15060 | |
15061 | SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, |
15062 | SelectionDAG &DAG) const { |
15063 | SDValue Src = Op.getOperand(0); |
15064 | MVT SrcVT = Src.getSimpleValueType(); |
15065 | MVT VT = Op.getSimpleValueType(); |
15066 | SDLoc dl(Op); |
15067 | |
15068 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
15069 | if (SrcVT.isVector()) { |
15070 | if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { |
15071 | return DAG.getNode(X86ISD::CVTSI2P, dl, VT, |
15072 | DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, |
15073 | DAG.getUNDEF(SrcVT))); |
15074 | } |
15075 | if (SrcVT.getVectorElementType() == MVT::i1) { |
15076 | if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) |
15077 | return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), |
15078 | DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); |
15079 | MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); |
15080 | return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), |
15081 | DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); |
15082 | } |
15083 | return SDValue(); |
15084 | } |
15085 | |
15086 | assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15087 | "Unknown SINT_TO_FP to lower!");
15088 | |
15089 | // These are really Legal; return the operand so the caller accepts it as |
15090 | // Legal. |
15091 | if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) |
15092 | return Op; |
15093 | if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && |
15094 | Subtarget.is64Bit()) { |
15095 | return Op; |
15096 | } |
15097 | |
15098 | SDValue ValueToStore = Op.getOperand(0); |
15099 | if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && |
15100 | !Subtarget.is64Bit()) |
15101 | // Bitcasting to f64 here allows us to do a single 64-bit store from |
15102 | // an SSE register, avoiding the store forwarding penalty that would come |
15103 | // with two 32-bit stores. |
15104 | ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); |
15105 | |
15106 | unsigned Size = SrcVT.getSizeInBits()/8; |
15107 | MachineFunction &MF = DAG.getMachineFunction(); |
15108 | auto PtrVT = getPointerTy(MF.getDataLayout()); |
15109 | int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); |
15110 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); |
15111 | SDValue Chain = DAG.getStore( |
15112 | DAG.getEntryNode(), dl, ValueToStore, StackSlot, |
15113 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); |
15114 | return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); |
15115 | } |
15116 | |
15117 | SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, |
15118 | SDValue StackSlot, |
15119 | SelectionDAG &DAG) const { |
15120 | // Build the FILD |
15121 | SDLoc DL(Op); |
15122 | SDVTList Tys; |
15123 | bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); |
15124 | if (useSSE) |
15125 | Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); |
15126 | else |
15127 | Tys = DAG.getVTList(Op.getValueType(), MVT::Other); |
15128 | |
15129 | unsigned ByteSize = SrcVT.getSizeInBits()/8; |
15130 | |
15131 | FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); |
15132 | MachineMemOperand *MMO; |
15133 | if (FI) { |
15134 | int SSFI = FI->getIndex(); |
15135 | MMO = DAG.getMachineFunction().getMachineMemOperand( |
15136 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), |
15137 | MachineMemOperand::MOLoad, ByteSize, ByteSize); |
15138 | } else { |
15139 | MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); |
15140 | StackSlot = StackSlot.getOperand(1); |
15141 | } |
15142 | SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; |
15143 | SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : |
15144 | X86ISD::FILD, DL, |
15145 | Tys, Ops, SrcVT, MMO); |
15146 | |
15147 | if (useSSE) { |
15148 | Chain = Result.getValue(1); |
15149 | SDValue InFlag = Result.getValue(2); |
15150 | |
15151 | // FIXME: Currently the FST is flagged to the FILD_FLAG. This |
15152 | // shouldn't be necessary except that RFP cannot be live across |
15153 | // multiple blocks. When stackifier is fixed, they can be uncoupled. |
15154 | MachineFunction &MF = DAG.getMachineFunction(); |
15155 | unsigned SSFISize = Op.getValueSizeInBits()/8; |
15156 | int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); |
15157 | auto PtrVT = getPointerTy(MF.getDataLayout()); |
15158 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); |
15159 | Tys = DAG.getVTList(MVT::Other); |
15160 | SDValue Ops[] = { |
15161 | Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag |
15162 | }; |
15163 | MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( |
15164 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), |
15165 | MachineMemOperand::MOStore, SSFISize, SSFISize); |
15166 | |
15167 | Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, |
15168 | Ops, Op.getValueType(), MMO); |
15169 | Result = DAG.getLoad( |
15170 | Op.getValueType(), DL, Chain, StackSlot, |
15171 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); |
15172 | } |
15173 | |
15174 | return Result; |
15175 | } |
15176 | |
15177 | /// 64-bit unsigned integer to double expansion. |
15178 | SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, |
15179 | SelectionDAG &DAG) const { |
15180 | // This algorithm is not obvious. Here is what we're trying to output:
15181 | /* |
15182 | movq %rax, %xmm0 |
15183 | punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } |
15184 | subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } |
15185 | #ifdef __SSE3__ |
15186 | haddpd %xmm0, %xmm0 |
15187 | #else |
15188 | pshufd $0x4e, %xmm0, %xmm1 |
15189 | addpd %xmm1, %xmm0 |
15190 | #endif |
15191 | */ |
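| // Why this works: punpckldq pairs the two 32-bit halves of the input with
| // the exponent words 0x43300000 and 0x45300000, producing the doubles
| // 2^52 + lo32 and 2^84 + hi32 * 2^32, both exact. Subtracting c1 removes
| // the 2^52 and 2^84 biases, and the final horizontal add (or shuffle +
| // add) yields lo32 + hi32 * 2^32, i.e. the original 64-bit value.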
15192 | |
15193 | SDLoc dl(Op); |
15194 | LLVMContext *Context = DAG.getContext(); |
15195 | |
15196 | // Build some magic constants. |
15197 | static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; |
15198 | Constant *C0 = ConstantDataVector::get(*Context, CV0); |
15199 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
15200 | SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); |
15201 | |
15202 | SmallVector<Constant*,2> CV1; |
15203 | CV1.push_back( |
15204 | ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), |
15205 | APInt(64, 0x4330000000000000ULL)))); |
15206 | CV1.push_back( |
15207 | ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), |
15208 | APInt(64, 0x4530000000000000ULL)))); |
15209 | Constant *C1 = ConstantVector::get(CV1); |
15210 | SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); |
15211 | |
15212 | // Load the 64-bit value into an XMM register. |
15213 | SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, |
15214 | Op.getOperand(0)); |
15215 | SDValue CLod0 = |
15216 | DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, |
15217 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
15218 | /* Alignment = */ 16); |
15219 | SDValue Unpck1 = |
15220 | getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); |
15221 | |
15222 | SDValue CLod1 = |
15223 | DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, |
15224 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), |
15225 | /* Alignment = */ 16); |
15226 | SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); |
15227 | // TODO: Are there any fast-math-flags to propagate here? |
15228 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); |
15229 | SDValue Result; |
15230 | |
15231 | if (Subtarget.hasSSE3()) { |
15232 | // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. |
15233 | Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); |
15234 | } else { |
15235 | SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); |
15236 | SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1}); |
15237 | Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, |
15238 | DAG.getBitcast(MVT::v2f64, Shuffle), Sub); |
15239 | } |
15240 | |
15241 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, |
15242 | DAG.getIntPtrConstant(0, dl)); |
15243 | } |
15244 | |
15245 | /// 32-bit unsigned integer to float expansion. |
15246 | SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, |
15247 | SelectionDAG &DAG) const { |
15248 | SDLoc dl(Op); |
15249 | // FP constant to bias correct the final result. |
15250 | SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, |
15251 | MVT::f64); |
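| // 0x4330000000000000 is the bit pattern of the double 2^52. OR'ing a
| // 32-bit value n into the low mantissa bits below yields 2^52 + n exactly,
| // so subtracting the bias at the end recovers (double)n with no rounding.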
15252 | |
15253 | // Load the 32-bit value into an XMM register. |
15254 | SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, |
15255 | Op.getOperand(0)); |
15256 | |
15257 | // Zero out the upper parts of the register. |
15258 | Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); |
15259 | |
15260 | Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, |
15261 | DAG.getBitcast(MVT::v2f64, Load), |
15262 | DAG.getIntPtrConstant(0, dl)); |
15263 | |
15264 | // Or the load with the bias. |
15265 | SDValue Or = DAG.getNode( |
15266 | ISD::OR, dl, MVT::v2i64, |
15267 | DAG.getBitcast(MVT::v2i64, |
15268 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), |
15269 | DAG.getBitcast(MVT::v2i64, |
15270 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); |
15271 | Or = |
15272 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, |
15273 | DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); |
15274 | |
15275 | // Subtract the bias. |
15276 | // TODO: Are there any fast-math-flags to propagate here? |
15277 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); |
15278 | |
15279 | // Handle final rounding. |
15280 | MVT DestVT = Op.getSimpleValueType(); |
15281 | |
15282 | if (DestVT.bitsLT(MVT::f64)) |
15283 | return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, |
15284 | DAG.getIntPtrConstant(0, dl)); |
15285 | if (DestVT.bitsGT(MVT::f64)) |
15286 | return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); |
15287 | |
15288 | // DestVT is f64 here; no rounding needed.
15289 | return Sub; |
15290 | } |
15291 | |
15292 | static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, |
15293 | const X86Subtarget &Subtarget, SDLoc &DL) { |
15294 | if (Op.getSimpleValueType() != MVT::v2f64) |
15295 | return SDValue(); |
15296 | |
15297 | SDValue N0 = Op.getOperand(0); |
15298 | assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15299 | |
15300 | // Legalize to v4i32 type. |
15301 | N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, |
15302 | DAG.getUNDEF(MVT::v2i32)); |
15303 | |
15304 | if (Subtarget.hasAVX512()) |
15305 | return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); |
15306 | |
15307 | // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, |
15308 | // but using v2i32 to v2f64 with X86ISD::CVTSI2P. |
15309 | SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); |
15310 | SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); |
15311 | |
15312 | // Two to the power of half-word-size. |
15313 | SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); |
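| // For example, v = 0x89ABCDEF splits into HI = 0x89AB and LO = 0xCDEF, and
| // fHI * 65536.0 + fLO == 2309737967.0 reconstructs the unsigned value
| // exactly, since each product fits in a double's 53-bit significand.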
15314 | |
15315 | // Clear upper part of LO, lower HI. |
15316 | SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); |
15317 | SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); |
15318 | |
15319 | SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); |
15320 | fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); |
15321 | SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); |
15322 | |
15323 | // Add the two halves. |
15324 | return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); |
15325 | } |
15326 | |
15327 | static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, |
15328 | const X86Subtarget &Subtarget) { |
15329 | // The algorithm is the following: |
15330 | // #ifdef __SSE4_1__ |
15331 | // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); |
15332 | // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), |
15333 | // (uint4) 0x53000000, 0xaa); |
15334 | // #else |
15335 | // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; |
15336 | // uint4 hi = (v >> 16) | (uint4) 0x53000000; |
15337 | // #endif |
15338 | // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); |
15339 | // return (float4) lo + fhi; |
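| // Why this works: 0x4b000000 is 2^23 as a float, so lo holds
| // 2^23 + (v & 0xffff) exactly; 0x53000000 is 2^39, so hi holds
| // 2^39 + (v >> 16) * 2^16 exactly. Adding -(0x1.0p39f + 0x1.0p23f) to hi
| // and then adding lo cancels both biases, leaving
| // (v >> 16) * 2^16 + (v & 0xffff) == v with a single final rounding.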
15340 | |
15341 | // We shouldn't use it when unsafe-fp-math is enabled though: we might later |
15342 | // reassociate the two FADDs, and if we do that, the algorithm fails |
15343 | // spectacularly (PR24512). |
15344 | // FIXME: If we ever have some kind of Machine FMF, this should be marked |
15345 | // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because |
15346 | // there's also the MachineCombiner reassociations happening on Machine IR. |
15347 | if (DAG.getTarget().Options.UnsafeFPMath) |
15348 | return SDValue(); |
15349 | |
15350 | SDLoc DL(Op); |
15351 | SDValue V = Op->getOperand(0); |
15352 | MVT VecIntVT = V.getSimpleValueType(); |
15353 | bool Is128 = VecIntVT == MVT::v4i32; |
15354 | MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; |
15355 | // If we convert to something else than the supported type, e.g., to v4f64, |
15356 | // abort early. |
15357 | if (VecFloatVT != Op->getSimpleValueType(0)) |
15358 | return SDValue(); |
15359 | |
15360 | assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15361 | "Unsupported custom type");
15362 | |
15363 | // In the #ifdef/#else code, we have in common:
15364 | // - The vector of constants: |
15365 | // -- 0x4b000000 |
15366 | // -- 0x53000000 |
15367 | // - A shift: |
15368 | // -- v >> 16 |
15369 | |
15370 | // Create the splat vector for 0x4b000000. |
15371 | SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); |
15372 | // Create the splat vector for 0x53000000. |
15373 | SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); |
15374 | |
15375 | // Create the right shift. |
15376 | SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); |
15377 | SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); |
15378 | |
15379 | SDValue Low, High; |
15380 | if (Subtarget.hasSSE41()) { |
15381 | MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; |
15382 | // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); |
15383 | SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); |
15384 | SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); |
15385 | // Low will be bitcasted right away, so do not bother bitcasting back to its |
15386 | // original type. |
15387 | Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, |
15388 | VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); |
15389 | // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), |
15390 | // (uint4) 0x53000000, 0xaa); |
15391 | SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); |
15392 | SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); |
15393 | // High will be bitcasted right away, so do not bother bitcasting back to |
15394 | // its original type. |
15395 | High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, |
15396 | VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); |
15397 | } else { |
15398 | SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); |
15399 | // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; |
15400 | SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); |
15401 | Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); |
15402 | |
15403 | // uint4 hi = (v >> 16) | (uint4) 0x53000000; |
15404 | High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); |
15405 | } |
15406 | |
15407 | // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). |
15408 | SDValue VecCstFAdd = DAG.getConstantFP( |
15409 | APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); |
15410 | |
15411 | // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); |
15412 | SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); |
15413 | // TODO: Are there any fast-math-flags to propagate here? |
15414 | SDValue FHigh = |
15415 | DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); |
15416 | // return (float4) lo + fhi; |
15417 | SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); |
15418 | return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); |
15419 | } |
15420 | |
15421 | SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, |
15422 | SelectionDAG &DAG) const { |
15423 | SDValue N0 = Op.getOperand(0); |
15424 | MVT SrcVT = N0.getSimpleValueType(); |
15425 | SDLoc dl(Op); |
15426 | |
15427 | if (SrcVT.getVectorElementType() == MVT::i1) { |
15428 | if (SrcVT == MVT::v2i1) |
15429 | return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), |
15430 | DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); |
15431 | MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); |
15432 | return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), |
15433 | DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); |
15434 | } |
15435 | |
15436 | switch (SrcVT.SimpleTy) { |
15437 | default: |
15438 | llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 15438); |
15439 | case MVT::v4i8: |
15440 | case MVT::v4i16: |
15441 | case MVT::v8i8: |
15442 | case MVT::v8i16: { |
15443 | MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); |
15444 | return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), |
15445 | DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); |
15446 | } |
15447 | case MVT::v2i32: |
15448 | return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); |
15449 | case MVT::v4i32: |
15450 | case MVT::v8i32: |
15451 | return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); |
15452 | case MVT::v16i8: |
15453 | case MVT::v16i16: |
15454 | assert(Subtarget.hasAVX512());
15455 | return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), |
15456 | DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); |
15457 | } |
15458 | } |
15459 | |
15460 | SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, |
15461 | SelectionDAG &DAG) const { |
15462 | SDValue N0 = Op.getOperand(0); |
15463 | SDLoc dl(Op); |
15464 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
15465 | |
15466 | // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't |
15467 | // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform |
15468 | // the optimization here. |
15469 | if (DAG.SignBitIsZero(N0)) |
15470 | return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); |
15471 | |
15472 | if (Op.getSimpleValueType().isVector()) |
15473 | return lowerUINT_TO_FP_vec(Op, DAG); |
15474 | |
15475 | MVT SrcVT = N0.getSimpleValueType(); |
15476 | MVT DstVT = Op.getSimpleValueType(); |
15477 | |
15478 | if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && |
15479 | (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { |
15480 | // Conversions from unsigned i32 to f32/f64 are legal, |
15481 | // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. |
15482 | return Op; |
15483 | } |
15484 | |
15485 | if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) |
15486 | return LowerUINT_TO_FP_i64(Op, DAG); |
15487 | if (SrcVT == MVT::i32 && X86ScalarSSEf64) |
15488 | return LowerUINT_TO_FP_i32(Op, DAG); |
15489 | if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) |
15490 | return SDValue(); |
15491 | |
15492 | // Make a 64-bit buffer, and use it to build an FILD. |
15493 | SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); |
15494 | if (SrcVT == MVT::i32) { |
15495 | SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); |
15496 | SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), |
15497 | StackSlot, MachinePointerInfo()); |
15498 | SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), |
15499 | OffsetSlot, MachinePointerInfo()); |
15500 | SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); |
15501 | return Fild; |
15502 | } |
15503 | |
15504 | assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15505 | SDValue ValueToStore = Op.getOperand(0); |
15506 | if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) |
15507 | // Bitcasting to f64 here allows us to do a single 64-bit store from |
15508 | // an SSE register, avoiding the store forwarding penalty that would come |
15509 | // with two 32-bit stores. |
15510 | ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); |
15511 | SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, |
15512 | MachinePointerInfo()); |
15513 | // For i64 source, we need to add the appropriate power of 2 if the input |
15514 | // was negative. This is the same as the optimization in |
15515 | // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15516 | // we must be careful to do the computation in x87 extended precision, not |
15517 | // in SSE. (The generic code can't know it's OK to do this, or how to.) |
15518 | int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); |
15519 | MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( |
15520 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), |
15521 | MachineMemOperand::MOLoad, 8, 8); |
15522 | |
15523 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); |
15524 | SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; |
15525 | SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, |
15526 | MVT::i64, MMO); |
15527 | |
15528 | APInt FF(32, 0x5F800000ULL); |
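| // 0x5F800000 is the single-precision encoding of 2^64: the fudge factor
| // added below when the i64 input was negative, since FILD interpreted it
| // as signed and the intended unsigned value is the signed value plus 2^64.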
15529 | |
15530 | // Check whether the sign bit is set. |
15531 | SDValue SignSet = DAG.getSetCC( |
15532 | dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), |
15533 | Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); |
15534 | |
15535 | // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. |
15536 | SDValue FudgePtr = DAG.getConstantPool( |
15537 | ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); |
15538 | |
15539 | // Get a pointer to FF if the sign bit was set, or to 0 otherwise. |
15540 | SDValue Zero = DAG.getIntPtrConstant(0, dl); |
15541 | SDValue Four = DAG.getIntPtrConstant(4, dl); |
15542 | SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); |
15543 | FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); |
15544 | |
15545 | // Load the value out, extending it from f32 to f80. |
15546 | // FIXME: Avoid the extend by constructing the right constant pool? |
15547 | SDValue Fudge = DAG.getExtLoad( |
15548 | ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, |
15549 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, |
15550 | /* Alignment = */ 4); |
15551 | // Extend everything to 80 bits to force it to be done on x87. |
15552 | // TODO: Are there any fast-math-flags to propagate here? |
15553 | SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); |
15554 | return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, |
15555 | DAG.getIntPtrConstant(0, dl)); |
15556 | } |
15557 | |
15558 | // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation |
15559 | // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), |
15560 | // just return an <SDValue(), SDValue()> pair. |
15561 | // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 |
15562 | // to i16, i32 or i64, and we lower it to a legal sequence. |
15563 | // If lowered to the final integer result we return a <result, SDValue()> pair. |
15564 | // Otherwise we lower it to a sequence ending with a FIST, return a |
15565 | // <FIST, StackSlot> pair, and the caller is responsible for loading |
15566 | // the final integer result from StackSlot. |
15567 | std::pair<SDValue,SDValue> |
15568 | X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, |
15569 | bool IsSigned, bool IsReplace) const { |
15570 | SDLoc DL(Op); |
15571 | |
15572 | EVT DstTy = Op.getValueType(); |
15573 | EVT TheVT = Op.getOperand(0).getValueType(); |
15574 | auto PtrVT = getPointerTy(DAG.getDataLayout()); |
15575 | |
15576 | if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { |
15577 | // f16 must be promoted before using the lowering in this routine. |
15578 | // fp128 does not use this lowering. |
15579 | return std::make_pair(SDValue(), SDValue()); |
15580 | } |
15581 | |
15582 | // If using FIST to compute an unsigned i64, we'll need some fixup |
15583 | // to handle values above the maximum signed i64. A FIST is always |
15584 | // used for the 32-bit subtarget, but also for f80 on a 64-bit target. |
15585 | bool UnsignedFixup = !IsSigned && |
15586 | DstTy == MVT::i64 && |
15587 | (!Subtarget.is64Bit() || |
15588 | !isScalarFPTypeInSSEReg(TheVT)); |
15589 | |
15590 | if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { |
15591 | // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. |
15592 | // The low 32 bits of the fist result will have the correct uint32 result. |
15593 | assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15594 | DstTy = MVT::i64; |
15595 | } |
15596 | |
15597 | assert(DstTy.getSimpleVT() <= MVT::i64 &&
15598 | DstTy.getSimpleVT() >= MVT::i16 &&
15599 | "Unknown FP_TO_INT to lower!");
15600 | |
15601 | // These are really Legal. |
15602 | if (DstTy == MVT::i32 && |
15603 | isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) |
15604 | return std::make_pair(SDValue(), SDValue()); |
15605 | if (Subtarget.is64Bit() && |
15606 | DstTy == MVT::i64 && |
15607 | isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) |
15608 | return std::make_pair(SDValue(), SDValue()); |
15609 | |
15610 | // We lower FP->int64 into FISTP64 followed by a load from a temporary |
15611 | // stack slot. |
15612 | MachineFunction &MF = DAG.getMachineFunction(); |
15613 | unsigned MemSize = DstTy.getSizeInBits()/8; |
15614 | int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); |
15615 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); |
15616 | |
15617 | unsigned Opc; |
15618 | switch (DstTy.getSimpleVT().SimpleTy) { |
15619 | default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15620 | case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; |
15621 | case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; |
15622 | case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; |
15623 | } |
15624 | |
15625 | SDValue Chain = DAG.getEntryNode(); |
15626 | SDValue Value = Op.getOperand(0); |
15627 | SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. |
15628 | |
15629 | if (UnsignedFixup) { |
15630 | // |
15631 | // Conversion to unsigned i64 is implemented with a select, |
15632 | // depending on whether the source value fits in the range |
15633 | // of a signed i64. Let Thresh be the FP equivalent of |
15634 | // 0x8000000000000000ULL. |
15635 | // |
15636 | // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; |
15637 | // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); |
15638 | // Fist-to-mem64 FistSrc |
15639 | // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent |
15640 | // to XOR'ing the high 32 bits with Adjust. |
15641 | // |
15642 | // Being a power of 2, Thresh is exactly representable in all FP formats. |
15643 | // For X87 we'd like to use the smallest FP type for this constant, but |
15644 | // for DAG type consistency we have to match the FP operand type. |
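//
// Worked example: Thresh is 2^63, whose f32 bit pattern is 0x5f000000
// (exponent field 190 = 127 + 63, zero mantissa). For Value = 2^63 the
// compare Value < Thresh is false, so Adjust = 0x80000000 and
// FistSrc = Value - Thresh = 0.0; the FIST stores 0 and XOR'ing the high
// 32 bits with Adjust reconstructs the unsigned result 0x8000000000000000.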
15645 | |
15646 | APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); |
15647 | LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15648 | bool LosesInfo = false; |
15649 | if (TheVT == MVT::f64) |
15650 | // The rounding mode is irrelevant as the conversion should be exact. |
15651 | Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, |
15652 | &LosesInfo); |
15653 | else if (TheVT == MVT::f80) |
15654 | Status = Thresh.convert(APFloat::x87DoubleExtended(), |
15655 | APFloat::rmNearestTiesToEven, &LosesInfo); |
15656 | |
15657 | assert(Status == APFloat::opOK && !LosesInfo &&
15658 | "FP conversion should have been exact");
15659 | |
15660 | SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); |
15661 | |
15662 | SDValue Cmp = DAG.getSetCC(DL, |
15663 | getSetCCResultType(DAG.getDataLayout(), |
15664 | *DAG.getContext(), TheVT), |
15665 | Value, ThreshVal, ISD::SETLT); |
15666 | Adjust = DAG.getSelect(DL, MVT::i32, Cmp, |
15667 | DAG.getConstant(0, DL, MVT::i32), |
15668 | DAG.getConstant(0x80000000, DL, MVT::i32)); |
15669 | SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); |
15670 | Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), |
15671 | *DAG.getContext(), TheVT), |
15672 | Value, ThreshVal, ISD::SETLT); |
15673 | Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); |
15674 | } |
15675 | |
15676 | // FIXME: This causes a redundant load/store if the SSE-class value is already
15677 | // in memory, such as if it is on the call stack.
15678 | if (isScalarFPTypeInSSEReg(TheVT)) { |
15679 | assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15680 | Chain = DAG.getStore(Chain, DL, Value, StackSlot, |
15681 | MachinePointerInfo::getFixedStack(MF, SSFI)); |
15682 | SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); |
15683 | SDValue Ops[] = { |
15684 | Chain, StackSlot, DAG.getValueType(TheVT) |
15685 | }; |
15686 | |
15687 | MachineMemOperand *MMO = |
15688 | MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), |
15689 | MachineMemOperand::MOLoad, MemSize, MemSize); |
15690 | Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); |
15691 | Chain = Value.getValue(1); |
15692 | SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); |
15693 | StackSlot = DAG.getFrameIndex(SSFI, PtrVT); |
15694 | } |
15695 | |
15696 | MachineMemOperand *MMO = |
15697 | MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), |
15698 | MachineMemOperand::MOStore, MemSize, MemSize); |
15699 | |
15700 | if (UnsignedFixup) { |
15701 | |
15702 | // Insert the FIST, load its result as two i32's, |
15703 | // and XOR the high i32 with Adjust. |
15704 | |
15705 | SDValue FistOps[] = { Chain, Value, StackSlot }; |
15706 | SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), |
15707 | FistOps, DstTy, MMO); |
15708 | |
15709 | SDValue Low32 = |
15710 | DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); |
15711 | SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); |
15712 | |
15713 | SDValue High32 = |
15714 | DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); |
15715 | High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); |
15716 | |
15717 | if (Subtarget.is64Bit()) { |
15718 | // Join High32 and Low32 into a 64-bit result. |
15719 | // (High32 << 32) | Low32 |
15720 | Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); |
15721 | High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); |
15722 | High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, |
15723 | DAG.getConstant(32, DL, MVT::i8)); |
15724 | SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); |
15725 | return std::make_pair(Result, SDValue()); |
15726 | } |
15727 | |
15728 | SDValue ResultOps[] = { Low32, High32 }; |
15729 | |
15730 | SDValue pair = IsReplace |
15731 | ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) |
15732 | : DAG.getMergeValues(ResultOps, DL); |
15733 | return std::make_pair(pair, SDValue()); |
15734 | } else { |
15735 | // Build the FP_TO_INT*_IN_MEM |
15736 | SDValue Ops[] = { Chain, Value, StackSlot }; |
15737 | SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), |
15738 | Ops, DstTy, MMO); |
15739 | return std::make_pair(FIST, StackSlot); |
15740 | } |
15741 | } |
15742 | |
15743 | static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, |
15744 | const X86Subtarget &Subtarget) { |
15745 | MVT VT = Op->getSimpleValueType(0); |
15746 | SDValue In = Op->getOperand(0); |
15747 | MVT InVT = In.getSimpleValueType(); |
15748 | SDLoc dl(Op); |
15749 | |
15750 | if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) |
15751 | return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); |
15752 | |
15753 | // Optimize vectors in AVX mode: |
15754 | // |
15755 | // v8i16 -> v8i32 |
15756 | // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. |
15757 | // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. |
15758 | // Concat upper and lower parts. |
15759 | // |
15760 | // v4i32 -> v4i64 |
15761 | // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. |
15762 | // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. |
15763 | // Concat upper and lower parts. |
15764 | // |
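// For example, zero-extending v8i16 <x0,...,x7>:
//   OpLo = vpunpcklwd(In, Zero) = <x0,0,x1,0,x2,0,x3,0>
//   OpHi = vpunpckhwd(In, Zero) = <x4,0,x5,0,x6,0,x7,0>
// and, bitcast to v4i32, these are exactly the low and high halves of the
// zero-extended v8i32 result (elements in little-endian order).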
15765 | |
15766 | if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && |
15767 | ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && |
15768 | ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) |
15769 | return SDValue(); |
15770 | |
15771 | if (Subtarget.hasInt256()) |
15772 | return DAG.getNode(X86ISD::VZEXT, dl, VT, In); |
15773 | |
15774 | SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); |
15775 | SDValue Undef = DAG.getUNDEF(InVT); |
15776 | bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; |
15777 | SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); |
15778 | SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); |
15779 | |
15780 | MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), |
15781 | VT.getVectorNumElements()/2); |
15782 | |
15783 | OpLo = DAG.getBitcast(HVT, OpLo); |
15784 | OpHi = DAG.getBitcast(HVT, OpHi); |
15785 | |
15786 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); |
15787 | } |
15788 | |
15789 | static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, |
15790 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { |
15791 | MVT VT = Op->getSimpleValueType(0); |
15792 | SDValue In = Op->getOperand(0); |
15793 | MVT InVT = In.getSimpleValueType(); |
15794 | SDLoc DL(Op); |
15795 | unsigned NumElts = VT.getVectorNumElements(); |
15796 | |
15797 | if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && |
15798 | (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) |
15799 | return DAG.getNode(X86ISD::VZEXT, DL, VT, In); |
15800 | |
15801 | if (InVT.getVectorElementType() != MVT::i1) |
15802 | return SDValue(); |
15803 | |
15804 | // Extend VT if the type is a 128- or 256-bit vector and VLX is not supported.
15805 | MVT ExtVT = VT; |
15806 | if (!VT.is512BitVector() && !Subtarget.hasVLX()) |
15807 | ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); |
15808 | |
15809 | SDValue One = |
15810 | DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); |
15811 | SDValue Zero = |
15812 | DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); |
15813 | |
15814 | SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero); |
15815 | if (VT == ExtVT) |
15816 | return SelectedVal; |
15817 | return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); |
15818 | } |
15819 | |
15820 | static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, |
15821 | SelectionDAG &DAG) { |
15822 | if (Subtarget.hasFp256()) |
15823 | if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) |
15824 | return Res; |
15825 | |
15826 | return SDValue(); |
15827 | } |
15828 | |
15829 | static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, |
15830 | SelectionDAG &DAG) { |
15831 | SDLoc DL(Op); |
15832 | MVT VT = Op.getSimpleValueType(); |
15833 | SDValue In = Op.getOperand(0); |
15834 | MVT SVT = In.getSimpleValueType(); |
15835 | |
15836 | if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) |
15837 | return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); |
15838 | |
15839 | if (Subtarget.hasFp256()) |
15840 | if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) |
15841 | return Res; |
15842 | |
15843 | assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15844 | VT.getVectorNumElements() != SVT.getVectorNumElements());
15845 | return SDValue(); |
15846 | } |
15847 | |
15848 | /// Helper to recursively truncate vector elements in half with PACKSS. |
15849 | /// It makes use of the fact that vector comparison results will be all-zeros |
15850 | /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types. |
15851 | /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates |
15852 | /// within each 128-bit lane. |
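/// PACKSS saturates each element to the narrower signed range, so an
/// all-ones input element (-1) packs to an all-ones output element and
/// zero packs to zero, which is why comparison masks survive the
/// truncation exactly.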
15853 | static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In, |
15854 | const SDLoc &DL, |
15855 | SelectionDAG &DAG, |
15856 | const X86Subtarget &Subtarget) { |
15857 | // Requires SSE2 but AVX512 has fast truncate. |
15858 | if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) |
15859 | return SDValue(); |
15860 | |
15861 | EVT SrcVT = In.getValueType(); |
15862 | |
15863 | // No truncation required; we might get here due to recursive calls.
15864 | if (SrcVT == DstVT) |
15865 | return In; |
15866 | |
15867 | // We only support vector truncation to 128 bits or greater from a
15868 | // 256-bit or greater source.
15869 | if ((DstVT.getSizeInBits() % 128) != 0) |
15870 | return SDValue(); |
15871 | if ((SrcVT.getSizeInBits() % 256) != 0) |
15872 | return SDValue(); |
15873 | |
15874 | unsigned NumElems = SrcVT.getVectorNumElements(); |
15875 | assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15876 | assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15877 | |
15878 | EVT PackedSVT = |
15879 | EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2); |
15880 | |
15881 | // Extract lower/upper subvectors. |
15882 | unsigned NumSubElts = NumElems / 2; |
15883 | unsigned SrcSizeInBits = SrcVT.getSizeInBits(); |
15884 | SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); |
15885 | SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); |
15886 | |
15887 | // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors. |
15888 | if (SrcVT.is256BitVector()) { |
15889 | Lo = DAG.getBitcast(MVT::v8i16, Lo); |
15890 | Hi = DAG.getBitcast(MVT::v8i16, Hi); |
15891 | SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi); |
15892 | return DAG.getBitcast(DstVT, Res); |
15893 | } |
15894 | |
15895 | // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors. |
15896 | // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS). |
15897 | if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { |
15898 | Lo = DAG.getBitcast(MVT::v16i16, Lo); |
15899 | Hi = DAG.getBitcast(MVT::v16i16, Hi); |
15900 | SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi); |
15901 | |
15902 | // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), |
15903 | // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). |
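// For example, for v16i16 inputs Lo = <L0..L15> and Hi = <H0..H15>, the
// 256-bit PACKSS produces <L0..L7, H0..H7, L8..L15, H8..H15>; the v4i64
// {0, 2, 1, 3} shuffle then restores <L0..L15, H0..H15>.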
15904 | Res = DAG.getBitcast(MVT::v4i64, Res); |
15905 | Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); |
15906 | |
15907 | if (DstVT.is256BitVector()) |
15908 | return DAG.getBitcast(DstVT, Res); |
15909 | |
15910 | // If 512-bit -> 128-bit, truncate another stage.
15911 | EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); |
15912 | Res = DAG.getBitcast(PackedVT, Res); |
15913 | return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); |
15914 | } |
15915 | |
15916 | // Recursively pack lower/upper subvectors, concat result and pack again. |
15917 | assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15918 | EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2); |
15919 | Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget); |
15920 | Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget); |
15921 | |
15922 | PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); |
15923 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); |
15924 | return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); |
15925 | } |
15926 | |
15927 | static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, |
15928 | const X86Subtarget &Subtarget) { |
15929 | |
15930 | SDLoc DL(Op); |
15931 | MVT VT = Op.getSimpleValueType(); |
15932 | SDValue In = Op.getOperand(0); |
15933 | MVT InVT = In.getSimpleValueType(); |
15934 | |
15935 | assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15936 | |
15937 | // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. |
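// VPMOVB2M/VPMOVW2M move each element's most significant bit into the
// mask register, and TESTM sets a mask bit when (a & b) != 0; after the
// shift only the original LSB survives (as the MSB), so both forms test
// exactly bit 0 of each element.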
15938 | unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; |
15939 | if (InVT.getScalarSizeInBits() <= 16) { |
15940 | if (Subtarget.hasBWI()) { |
15941 | // Legal; this will be selected to VPMOVB2M or VPMOVW2M.
15942 | // Shifting packed bytes is not supported natively, so bitcast to words.
15943 | MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); |
15944 | SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, |
15945 | DAG.getBitcast(ExtVT, In), |
15946 | DAG.getConstant(ShiftInx, DL, ExtVT)); |
15947 | ShiftNode = DAG.getBitcast(InVT, ShiftNode); |
15948 | return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); |
15949 | } |
15950 | // Use TESTD/Q after sign-extending the vector to packed dwords/qwords.
15951 | assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15952 | "Unexpected vector type.");
15953 | unsigned NumElts = InVT.getVectorNumElements(); |
15954 | MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); |
15955 | In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); |
15956 | InVT = ExtVT; |
15957 | ShiftInx = InVT.getScalarSizeInBits() - 1; |
15958 | } |
15959 | |
15960 | SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, |
15961 | DAG.getConstant(ShiftInx, DL, InVT)); |
15962 | return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); |
15963 | } |
15964 | |
15965 | SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { |
15966 | SDLoc DL(Op); |
15967 | MVT VT = Op.getSimpleValueType(); |
15968 | SDValue In = Op.getOperand(0); |
15969 | MVT InVT = In.getSimpleValueType(); |
15970 | |
15971 | if (VT == MVT::i1) { |
15972 | assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15973 | "Invalid scalar TRUNCATE operation");
15974 | if (InVT.getSizeInBits() >= 32) |
15975 | return SDValue(); |
15976 | In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); |
15977 | return DAG.getNode(ISD::TRUNCATE, DL, VT, In); |
15978 | } |
15979 | assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15980 | "Invalid TRUNCATE operation");
15981 | |
15982 | if (VT.getVectorElementType() == MVT::i1) |
15983 | return LowerTruncateVecI1(Op, DAG, Subtarget); |
15984 | |
15985 | // vpmovqb/w/d, vpmovdb/w, vpmovwb |
15986 | if (Subtarget.hasAVX512()) { |
15987 | // Word-to-byte truncation is only legal with BWI.
15988 | if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 |
15989 | return DAG.getNode(X86ISD::VTRUNC, DL, VT, |
15990 | getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); |
15991 | return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); |
15992 | } |
15993 | |
15994 | // Truncate with PACKSS if we are truncating a vector zero/all-bits result. |
15995 | if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In)) |
15996 | if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) |
15997 | return V; |
15998 | |
15999 | if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { |
16000 | // On AVX2, v4i64 -> v4i32 becomes VPERMD. |
16001 | if (Subtarget.hasInt256()) { |
16002 | static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; |
16003 | In = DAG.getBitcast(MVT::v8i32, In); |
16004 | In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); |
16005 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, |
16006 | DAG.getIntPtrConstant(0, DL)); |
16007 | } |
16008 | |
16009 | SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, |
16010 | DAG.getIntPtrConstant(0, DL)); |
16011 | SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, |
16012 | DAG.getIntPtrConstant(2, DL)); |
16013 | OpLo = DAG.getBitcast(MVT::v4i32, OpLo); |
16014 | OpHi = DAG.getBitcast(MVT::v4i32, OpHi); |
16015 | static const int ShufMask[] = {0, 2, 4, 6}; |
16016 | return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); |
16017 | } |
16018 | |
16019 | if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { |
16020 | // On AVX2, v8i32 -> v8i16 becomes PSHUFB. |
16021 | if (Subtarget.hasInt256()) { |
16022 | In = DAG.getBitcast(MVT::v32i8, In); |
16023 | |
16024 | // The PSHUFB mask: |
16025 | static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, |
16026 | -1, -1, -1, -1, -1, -1, -1, -1, |
16027 | 16, 17, 20, 21, 24, 25, 28, 29, |
16028 | -1, -1, -1, -1, -1, -1, -1, -1 }; |
16029 | In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); |
16030 | In = DAG.getBitcast(MVT::v4i64, In); |
16031 | |
16032 | static const int ShufMask2[] = {0, 2, -1, -1}; |
16033 | In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); |
16034 | In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, |
16035 | DAG.getIntPtrConstant(0, DL)); |
16036 | return DAG.getBitcast(VT, In); |
16037 | } |
16038 | |
16039 | SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, |
16040 | DAG.getIntPtrConstant(0, DL)); |
16041 | |
16042 | SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, |
16043 | DAG.getIntPtrConstant(4, DL)); |
16044 | |
16045 | OpLo = DAG.getBitcast(MVT::v16i8, OpLo); |
16046 | OpHi = DAG.getBitcast(MVT::v16i8, OpHi); |
16047 | |
16048 | // The PSHUFB mask: |
16049 | static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, |
16050 | -1, -1, -1, -1, -1, -1, -1, -1}; |
16051 | |
16052 | OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); |
16053 | OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); |
16054 | |
16055 | OpLo = DAG.getBitcast(MVT::v4i32, OpLo); |
16056 | OpHi = DAG.getBitcast(MVT::v4i32, OpHi); |
16057 | |
16058 | // The MOVLHPS Mask: |
16059 | static const int ShufMask2[] = {0, 1, 4, 5}; |
16060 | SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); |
16061 | return DAG.getBitcast(MVT::v8i16, res); |
16062 | } |
16063 | |
16064 | // Handle truncation of V256 to V128 using shuffles. |
16065 | if (!VT.is128BitVector() || !InVT.is256BitVector()) |
16066 | return SDValue(); |
16067 | |
16068 | assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16069 | |
16070 | unsigned NumElems = VT.getVectorNumElements(); |
16071 | MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); |
16072 | |
16073 | SmallVector<int, 16> MaskVec(NumElems * 2, -1); |
16074 | // Prepare truncation shuffle mask |
16075 | for (unsigned i = 0; i != NumElems; ++i) |
16076 | MaskVec[i] = i * 2; |
16077 | In = DAG.getBitcast(NVT, In); |
16078 | SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); |
16079 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, |
16080 | DAG.getIntPtrConstant(0, DL)); |
16081 | } |
16082 | |
16083 | SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { |
16084 | bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; |
16085 | MVT VT = Op.getSimpleValueType(); |
16086 | |
16087 | if (VT.isVector()) { |
16088 | assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16089 | SDValue Src = Op.getOperand(0); |
16090 | SDLoc dl(Op); |
16091 | if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { |
16092 | return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, |
16093 | DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, |
16094 | DAG.getUNDEF(MVT::v2f32))); |
16095 | } |
16096 | |
16097 | return SDValue(); |
16098 | } |
16099 | |
16100 | assert(!VT.isVector());
16101 | |
16102 | std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, |
16103 | IsSigned, /*IsReplace=*/ false); |
16104 | SDValue FIST = Vals.first, StackSlot = Vals.second; |
16105 | // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. |
16106 | if (!FIST.getNode()) |
16107 | return Op; |
16108 | |
16109 | if (StackSlot.getNode()) |
16110 | // Load the result. |
16111 | return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); |
16112 | |
16113 | // The node is the result. |
16114 | return FIST; |
16115 | } |
16116 | |
16117 | static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { |
16118 | SDLoc DL(Op); |
16119 | MVT VT = Op.getSimpleValueType(); |
16120 | SDValue In = Op.getOperand(0); |
16121 | MVT SVT = In.getSimpleValueType(); |
16122 | |
16123 | assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16124 | |
16125 | return DAG.getNode(X86ISD::VFPEXT, DL, VT, |
16126 | DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, |
16127 | In, DAG.getUNDEF(SVT))); |
16128 | } |
16129 | |
16130 | /// The only differences between FABS and FNEG are the mask and the logic op. |
16131 | /// FNEG also has a folding opportunity for FNEG(FABS(x)). |
16132 | static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { |
16133 | assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16134 | "Wrong opcode for lowering FABS or FNEG.");
16135 | |
16136 | bool IsFABS = (Op.getOpcode() == ISD::FABS); |
16137 | |
16138 | // If this is a FABS and it has an FNEG user, bail out to fold the combination |
16139 | // into an FNABS. We'll lower the FABS after that if it is still in use. |
16140 | if (IsFABS) |
16141 | for (SDNode *User : Op->uses()) |
16142 | if (User->getOpcode() == ISD::FNEG) |
16143 | return Op; |
16144 | |
16145 | SDLoc dl(Op); |
16146 | MVT VT = Op.getSimpleValueType(); |
16147 | |
16148 | bool IsF128 = (VT == MVT::f128); |
16149 | |
16150 | // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to |
16151 | // decide if we should generate a 16-byte constant mask when we only need 4 or |
16152 | // 8 bytes for the scalar case. |
16153 | |
16154 | MVT LogicVT; |
16155 | MVT EltVT; |
16156 | |
16157 | if (VT.isVector()) { |
16158 | LogicVT = VT; |
16159 | EltVT = VT.getVectorElementType(); |
16160 | } else if (IsF128) { |
16161 | // SSE instructions are used for optimized f128 logical operations. |
16162 | LogicVT = MVT::f128; |
16163 | EltVT = VT; |
16164 | } else { |
16165 | // There are no scalar bitwise logical SSE/AVX instructions, so we |
16166 | // generate a 16-byte vector constant and logic op even for the scalar case. |
16167 | // Using a 16-byte mask allows folding the load of the mask with |
16168 | // the logic op, so it can save (~4 bytes) on code size. |
16169 | LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; |
16170 | EltVT = VT; |
16171 | } |
16172 | |
16173 | unsigned EltBits = EltVT.getSizeInBits(); |
16174 | // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... |
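// e.g. for f32, FABS becomes (X86ISD::FAND X, 0x7fffffff), clearing the
// sign bit, and FNEG becomes (X86ISD::FXOR X, 0x80000000), flipping it.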
16175 | APInt MaskElt = |
16176 | IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); |
16177 | const fltSemantics &Sem = |
16178 | EltVT == MVT::f64 ? APFloat::IEEEdouble() : |
16179 | (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); |
16180 | SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); |
16181 | |
16182 | SDValue Op0 = Op.getOperand(0); |
16183 | bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); |
16184 | unsigned LogicOp = |
16185 | IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; |
16186 | SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; |
16187 | |
16188 | if (VT.isVector() || IsF128) |
16189 | return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); |
16190 | |
16191 | // For the scalar case extend to a 128-bit vector, perform the logic op, |
16192 | // and extract the scalar result back out. |
16193 | Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); |
16194 | SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); |
16195 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, |
16196 | DAG.getIntPtrConstant(0, dl)); |
16197 | } |
16198 | |
16199 | static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { |
16200 | SDValue Mag = Op.getOperand(0); |
16201 | SDValue Sign = Op.getOperand(1); |
16202 | SDLoc dl(Op); |
16203 | |
16204 | // If the sign operand is smaller, extend it first. |
16205 | MVT VT = Op.getSimpleValueType(); |
16206 | if (Sign.getSimpleValueType().bitsLT(VT)) |
16207 | Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); |
16208 | |
16209 | // And if it is bigger, shrink it first. |
16210 | if (Sign.getSimpleValueType().bitsGT(VT)) |
16211 | Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); |
16212 | |
16213 | // At this point the operands and the result should have the same |
16214 | // type, and that won't be f80 since that is not custom lowered. |
16215 | bool IsF128 = (VT == MVT::f128); |
16216 | assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16217 | VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16218 | VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16219 | "Unexpected type in LowerFCOPYSIGN");
16220 | |
16221 | MVT EltVT = VT.getScalarType(); |
16222 | const fltSemantics &Sem = |
16223 | EltVT == MVT::f64 ? APFloat::IEEEdouble() |
16224 | : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); |
16225 | |
16226 | // Perform all scalar logic operations as 16-byte vectors because there are no |
16227 | // scalar FP logic instructions in SSE. |
16228 | // TODO: This isn't necessary. If we used scalar types, we might avoid some |
16229 | // unnecessary splats, but we might miss load folding opportunities. Should |
16230 | // this decision be based on OptimizeForSize? |
16231 | bool IsFakeVector = !VT.isVector() && !IsF128; |
16232 | MVT LogicVT = VT; |
16233 | if (IsFakeVector) |
16234 | LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; |
16235 | |
16236 | // The mask constants are automatically splatted for vector types. |
16237 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); |
16238 | SDValue SignMask = DAG.getConstantFP( |
16239 | APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); |
16240 | SDValue MagMask = DAG.getConstantFP( |
16241 | APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT); |
16242 | |
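// The lowering computes (Mag & ~SignMask) | (Sign & SignMask); for
// example, copysign(-3.0f, 1.0f) clears bit 31 of -3.0f and ORs in the
// (clear) sign bit of 1.0f, giving 3.0f.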
16243 | // First, clear all bits but the sign bit from the second operand (sign). |
16244 | if (IsFakeVector) |
16245 | Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); |
16246 | SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); |
16247 | |
16248 | // Next, clear the sign bit from the first operand (magnitude). |
16249 | // TODO: If we had general constant folding for FP logic ops, this check |
16250 | // wouldn't be necessary. |
16251 | SDValue MagBits; |
16252 | if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) { |
16253 | APFloat APF = Op0CN->getValueAPF(); |
16254 | APF.clearSign(); |
16255 | MagBits = DAG.getConstantFP(APF, dl, LogicVT); |
16256 | } else { |
16257 | // If the magnitude operand wasn't a constant, we need to AND out the sign. |
16258 | if (IsFakeVector) |
16259 | Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); |
16260 | MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); |
16261 | } |
16262 | |
16263 | // OR the magnitude value with the sign bit. |
16264 | SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); |
16265 | return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, |
16266 | DAG.getIntPtrConstant(0, dl)); |
16267 | } |
16268 | |
16269 | static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { |
16270 | SDValue N0 = Op.getOperand(0); |
16271 | SDLoc dl(Op); |
16272 | MVT VT = Op.getSimpleValueType(); |
16273 | |
16274 | MVT OpVT = N0.getSimpleValueType(); |
16275 | assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16276 | "Unexpected type for FGETSIGN");
16277 | |
16278 | // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). |
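// MOVMSK packs the sign bit of each vector element into the low bits of a
// GPR; the scalar sits in element 0 after SCALAR_TO_VECTOR, so its sign
// bit lands in bit 0 and the AND with 1 masks off the undef upper
// elements.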
16279 | MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); |
16280 | SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); |
16281 | Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); |
16282 | Res = DAG.getZExtOrTrunc(Res, dl, VT); |
16283 | Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); |
16284 | return Res; |
16285 | } |
16286 | |
16287 | // Check whether an OR'd tree is PTEST-able. |
16288 | static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, |
16289 | SelectionDAG &DAG) { |
16290 | assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16291 | |
16292 | if (!Subtarget.hasSSE41()) |
16293 | return SDValue(); |
16294 | |
16295 | if (!Op->hasOneUse()) |
16296 | return SDValue(); |
16297 | |
16298 | SDNode *N = Op.getNode(); |
16299 | SDLoc DL(N); |
16300 | |
16301 | SmallVector<SDValue, 8> Opnds; |
16302 | DenseMap<SDValue, unsigned> VecInMap; |
16303 | SmallVector<SDValue, 8> VecIns; |
16304 | EVT VT = MVT::Other; |
16305 | |
16306 | // Recognize a special case where a vector is cast into a wide integer to
16307 | // test for all zeros.
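// For example, for a v2i64 vector V,
//   (or (extractelt V, 0), (extractelt V, 1)) == 0
// tests whether all of V is zero and can be lowered to a single
// PTEST V, V that sets ZF.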
16308 | Opnds.push_back(N->getOperand(0)); |
16309 | Opnds.push_back(N->getOperand(1)); |
16310 | |
16311 | for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { |
16312 | SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; |
16313 | // BFS traverse all OR'd operands. |
16314 | if (I->getOpcode() == ISD::OR) { |
16315 | Opnds.push_back(I->getOperand(0)); |
16316 | Opnds.push_back(I->getOperand(1)); |
16317 | // Re-evaluate the number of nodes to be traversed. |
16318 | e += 2; // 2 more nodes (LHS and RHS) are pushed. |
16319 | continue; |
16320 | } |
16321 | |
16322 | // Quit if this is not an EXTRACT_VECTOR_ELT.
16323 | if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
16324 | return SDValue(); |
16325 | |
16326 | // Quit if the index is not a constant.
16327 | SDValue Idx = I->getOperand(1); |
16328 | if (!isa<ConstantSDNode>(Idx)) |
16329 | return SDValue(); |
16330 | |
16331 | SDValue ExtractedFromVec = I->getOperand(0); |
16332 | DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); |
16333 | if (M == VecInMap.end()) { |
16334 | VT = ExtractedFromVec.getValueType(); |
16335 | // Quit if not 128/256-bit vector. |
16336 | if (!VT.is128BitVector() && !VT.is256BitVector()) |
16337 | return SDValue(); |
16338 | // Quit if not the same type. |
16339 | if (VecInMap.begin() != VecInMap.end() && |
16340 | VT != VecInMap.begin()->first.getValueType()) |
16341 | return SDValue(); |
16342 | M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; |
16343 | VecIns.push_back(ExtractedFromVec); |
16344 | } |
16345 | M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); |
16346 | } |
16347 | |
16348 | assert((VT.is128BitVector() || VT.is256BitVector()) &&
16349 | "Not extracted from 128-/256-bit vector.");
16350 | |
16351 | unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; |
16352 | |
16353 | for (DenseMap<SDValue, unsigned>::const_iterator |
16354 | I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { |
16355 | // Quit if not all elements are used. |
16356 | if (I->second != FullMask) |
16357 | return SDValue(); |
16358 | } |
16359 | |
16360 | MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; |
16361 | |
16362 | // Cast all vectors into TestVT for PTEST. |
16363 | for (unsigned i = 0, e = VecIns.size(); i < e; ++i) |
16364 | VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); |
16365 | |
16366 | // If more than one full vector is evaluated, OR them first before PTEST. |
16367 | for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { |
16368 | // Each iteration will OR 2 nodes and append the result until there is only |
16369 | // 1 node left, i.e. the final OR'd value of all vectors. |
16370 | SDValue LHS = VecIns[Slot]; |
16371 | SDValue RHS = VecIns[Slot + 1]; |
16372 | VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); |
16373 | } |
16374 | |
16375 | return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); |
16376 | } |
16377 | |
16378 | /// \brief Return true if \c Op has a use that doesn't just read flags.
16379 | static bool hasNonFlagsUse(SDValue Op) { |
16380 | for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; |
16381 | ++UI) { |
16382 | SDNode *User = *UI; |
16383 | unsigned UOpNo = UI.getOperandNo(); |
16384 | if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { |
16385 | // Look past the truncate.
16386 | UOpNo = User->use_begin().getOperandNo(); |
16387 | User = *User->use_begin(); |
16388 | } |
16389 | |
16390 | if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && |
16391 | !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) |
16392 | return true; |
16393 | } |
16394 | return false; |
16395 | } |
16396 | |
16397 | // Emit KTEST instruction for bit vectors on AVX-512 |
16398 | static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, |
16399 | const X86Subtarget &Subtarget) { |
16400 | if (Op.getOpcode() == ISD::BITCAST) { |
16401 | auto hasKTEST = [&](MVT VT) { |
16402 | unsigned SizeInBits = VT.getSizeInBits(); |
16403 | return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || |
16404 | (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); |
16405 | }; |
16406 | SDValue Op0 = Op.getOperand(0); |
16407 | MVT Op0VT = Op0.getValueType().getSimpleVT(); |
16408 | if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && |
16409 | hasKTEST(Op0VT)) |
16410 | return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); |
16411 | } |
16412 | return SDValue(); |
16413 | } |
16414 | |
16415 | /// Emit nodes that will be selected as "test Op0,Op0", or something |
16416 | /// equivalent. |
16417 | SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, |
16418 | SelectionDAG &DAG) const { |
16419 | if (Op.getValueType() == MVT::i1) { |
16420 | SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); |
16421 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, |
16422 | DAG.getConstant(0, dl, MVT::i8)); |
16423 | } |
16424 | // CF and OF aren't always set the way we want. Determine which |
16425 | // of these we need. |
16426 | bool NeedCF = false; |
16427 | bool NeedOF = false; |
16428 | switch (X86CC) { |
16429 | default: break; |
16430 | case X86::COND_A: case X86::COND_AE: |
16431 | case X86::COND_B: case X86::COND_BE: |
16432 | NeedCF = true; |
16433 | break; |
16434 | case X86::COND_G: case X86::COND_GE: |
16435 | case X86::COND_L: case X86::COND_LE: |
16436 | case X86::COND_O: case X86::COND_NO: { |
16437 | // Check if we really need to set the
16438 | // overflow flag. If NoSignedWrap is present,
16439 | // it is not actually needed.
16440 | switch (Op->getOpcode()) { |
16441 | case ISD::ADD: |
16442 | case ISD::SUB: |
16443 | case ISD::MUL: |
16444 | case ISD::SHL: |
16445 | if (Op.getNode()->getFlags().hasNoSignedWrap()) |
16446 | break; |
16447 | LLVM_FALLTHROUGH;
16448 | default: |
16449 | NeedOF = true; |
16450 | break; |
16451 | } |
16452 | break; |
16453 | } |
16454 | } |
16455 | // See if we can use the EFLAGS value from the operand instead of |
16456 | // doing a separate TEST. TEST always sets OF and CF to 0, so unless |
16457 | // we prove that the arithmetic won't overflow, we can't use OF or CF. |
16458 | if (Op.getResNo() != 0 || NeedOF || NeedCF) { |
16459 | // Emit KTEST for bit vectors |
16460 | if (auto Node = EmitKTEST(Op, DAG, Subtarget)) |
16461 | return Node; |
16462 | // Emit a CMP with 0, which is the TEST pattern. |
16463 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, |
16464 | DAG.getConstant(0, dl, Op.getValueType())); |
16465 | } |
16466 | unsigned Opcode = 0; |
16467 | unsigned NumOperands = 0; |
16468 | |
16469 | // Truncate operations may prevent the merge of the SETCC instruction |
16470 | // and the arithmetic instruction before it. Attempt to truncate the operands |
16471 | // of the arithmetic instruction and use a reduced bit-width instruction. |
16472 | bool NeedTruncation = false; |
16473 | SDValue ArithOp = Op; |
16474 | if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { |
16475 | SDValue Arith = Op->getOperand(0); |
16476 | // Both the trunc and the arithmetic op need to have one user each. |
16477 | if (Arith->hasOneUse()) |
16478 | switch (Arith.getOpcode()) { |
16479 | default: break; |
16480 | case ISD::ADD: |
16481 | case ISD::SUB: |
16482 | case ISD::AND: |
16483 | case ISD::OR: |
16484 | case ISD::XOR: { |
16485 | NeedTruncation = true; |
16486 | ArithOp = Arith; |
16487 | } |
16488 | } |
16489 | } |
16490 | |
16491 | // Sometimes flags can be set either with an AND or with an SRL/SHL |
16492 | // instruction. The SRL/SHL variant should be preferred for masks longer than this
16493 | // number of bits. |
16494 | const int ShiftToAndMaxMaskWidth = 32; |
16495 | const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); |
16496 | |
16497 | // NOTICE: In the code below we use ArithOp to hold the arithmetic operation |
16498 | // which may be the result of a CAST. We use the variable 'Op', which is the |
16499 | // non-casted variable when we check for possible users. |
16500 | switch (ArithOp.getOpcode()) { |
16501 | case ISD::ADD: |
16502 | // Due to an isel shortcoming, be conservative if this add is likely to be |
16503 | // selected as part of a load-modify-store instruction. When the root node |
16504 | // in a match is a store, isel doesn't know how to remap non-chain non-flag |
16505 | // uses of other nodes in the match, such as the ADD in this case. This |
16506 | // leads to the ADD being left around and reselected, with the result being |
16507 | // two adds in the output. Alas, even if none of our users are stores, that
16508 | // doesn't prove we're O.K. Ergo, if we have any parents that aren't |
16509 | // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require |
16510 | // climbing the DAG back to the root, and it doesn't seem to be worth the |
16511 | // effort. |
16512 | for (SDNode::use_iterator UI = Op.getNode()->use_begin(), |
16513 | UE = Op.getNode()->use_end(); UI != UE; ++UI) |
16514 | if (UI->getOpcode() != ISD::CopyToReg && |
16515 | UI->getOpcode() != ISD::SETCC && |
16516 | UI->getOpcode() != ISD::STORE) |
16517 | goto default_case; |
16518 | |
16519 | if (ConstantSDNode *C = |
16520 | dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) { |
16521 | // An add of one will be selected as an INC. |
16522 | if (C->isOne() && !Subtarget.slowIncDec()) { |
16523 | Opcode = X86ISD::INC; |
16524 | NumOperands = 1; |
16525 | break; |
16526 | } |
16527 | |
16528 | // An add of negative one (subtract of one) will be selected as a DEC. |
16529 | if (C->isAllOnesValue() && !Subtarget.slowIncDec()) { |
16530 | Opcode = X86ISD::DEC; |
16531 | NumOperands = 1; |
16532 | break; |
16533 | } |
16534 | } |
16535 | |
16536 | // Otherwise use a regular EFLAGS-setting add. |
16537 | Opcode = X86ISD::ADD; |
16538 | NumOperands = 2; |
16539 | break; |
16540 | case ISD::SHL: |
16541 | case ISD::SRL: |
16542 | // If we have a constant logical shift that's only used in a comparison |
16543 | // against zero, turn it into an equivalent AND. This allows turning it into
16544 | // a TEST instruction later. |
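// For example, (srl X, 5) == 0 is equivalent to (X & 0xffffffe0) == 0,
// which can later be selected as a TEST with an immediate instead of a
// shift plus a compare.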
16545 | if (ZeroCheck && Op->hasOneUse() && |
16546 | isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { |
16547 | EVT VT = Op.getValueType(); |
16548 | unsigned BitWidth = VT.getSizeInBits(); |
16549 | unsigned ShAmt = Op->getConstantOperandVal(1); |
16550 | if (ShAmt >= BitWidth) // Avoid undefined shifts. |
16551 | break; |
16552 | APInt Mask = ArithOp.getOpcode() == ISD::SRL |
16553 | ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) |
16554 | : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); |
16555 | if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) |
16556 | break; |
16557 | Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), |
16558 | DAG.getConstant(Mask, dl, VT)); |
16559 | } |
16560 | break; |
16561 | |
16562 | case ISD::AND: |
16563 | // If the primary 'and' result isn't used, don't bother using X86ISD::AND, |
16564 | // because a TEST instruction will be better. However, AND should be |
16565 | // preferred if the instruction can be combined into ANDN. |
16566 | if (!hasNonFlagsUse(Op)) { |
16567 | SDValue Op0 = ArithOp->getOperand(0); |
16568 | SDValue Op1 = ArithOp->getOperand(1); |
16569 | EVT VT = ArithOp.getValueType(); |
16570 | bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); |
16571 | bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; |
16572 | bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); |
16573 | |
16574 | // If we cannot select an ANDN instruction, check if we can replace |
16575 | // AND+IMM64 with a shift before giving up. This is possible for masks |
16576 | // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. |
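// For example, with a 32-bit X, (X & 0xFF000000) == 0 is equivalent to
// (X >> 24) == 0 and (X & 0x00FFFFFF) == 0 to (X << 8) == 0; the same
// holds for 64-bit masks that are too wide to fold into a TEST.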
16577 | if (!isProperAndn) { |
16578 | if (!ZeroCheck) |
16579 | break; |
16580 | |
16581 | assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16582 | auto *CN = dyn_cast<ConstantSDNode>(Op1); |
16583 | if (!CN) |
16584 | break; |
16585 | |
16586 | const APInt &Mask = CN->getAPIntValue(); |
16587 | if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) |
16588 | break; // Prefer TEST instruction. |
16589 | |
16590 | unsigned BitWidth = Mask.getBitWidth(); |
16591 | unsigned LeadingOnes = Mask.countLeadingOnes(); |
16592 | unsigned TrailingZeros = Mask.countTrailingZeros(); |
16593 | |
16594 | if (LeadingOnes + TrailingZeros == BitWidth) { |
16595 | assert(TrailingZeros < VT.getSizeInBits() &&
16596 | "Shift amount should be less than the type width");
16597 | MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); |
16598 | SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); |
16599 | Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); |
16600 | break; |
16601 | } |
16602 | |
16603 | unsigned LeadingZeros = Mask.countLeadingZeros(); |
16604 | unsigned TrailingOnes = Mask.countTrailingOnes(); |
16605 | |
16606 | if (LeadingZeros + TrailingOnes == BitWidth) { |
16607 | assert(LeadingZeros < VT.getSizeInBits() &&
16608 | "Shift amount should be less than the type width");
16609 | MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); |
16610 | SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); |
16611 | Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); |
16612 | break; |
16613 | } |
16614 | |
16615 | break; |
16616 | } |
16617 | } |
16618 | LLVM_FALLTHROUGH;
16619 | case ISD::SUB: |
16620 | case ISD::OR: |
16621 | case ISD::XOR: |
16622 | // Due to the ISEL shortcoming noted above, be conservative if this op is |
16623 | // likely to be selected as part of a load-modify-store instruction. |
16624 | for (SDNode::use_iterator UI = Op.getNode()->use_begin(), |
16625 | UE = Op.getNode()->use_end(); UI != UE; ++UI) |
16626 | if (UI->getOpcode() == ISD::STORE) |
16627 | goto default_case; |
16628 | |
16629 | // Otherwise use a regular EFLAGS-setting instruction. |
16630 | switch (ArithOp.getOpcode()) { |
16631 | default: llvm_unreachable("unexpected operator!");
16632 | case ISD::SUB: Opcode = X86ISD::SUB; break; |
16633 | case ISD::XOR: Opcode = X86ISD::XOR; break; |
16634 | case ISD::AND: Opcode = X86ISD::AND; break; |
16635 | case ISD::OR: { |
16636 | if (!NeedTruncation && ZeroCheck) { |
16637 | if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) |
16638 | return EFLAGS; |
16639 | } |
16640 | Opcode = X86ISD::OR; |
16641 | break; |
16642 | } |
16643 | } |
16644 | |
16645 | NumOperands = 2; |
16646 | break; |
16647 | case X86ISD::ADD: |
16648 | case X86ISD::SUB: |
16649 | case X86ISD::INC: |
16650 | case X86ISD::DEC: |
16651 | case X86ISD::OR: |
16652 | case X86ISD::XOR: |
16653 | case X86ISD::AND: |
16654 | return SDValue(Op.getNode(), 1); |
16655 | default: |
16656 | default_case: |
16657 | break; |
16658 | } |
16659 | |
16660 | // If we found that truncation is beneficial, perform the truncation and |
16661 | // update 'Op'. |
16662 | if (NeedTruncation) { |
16663 | EVT VT = Op.getValueType(); |
16664 | SDValue WideVal = Op->getOperand(0); |
16665 | EVT WideVT = WideVal.getValueType(); |
16666 | unsigned ConvertedOp = 0; |
16667 | // Use a target machine opcode to prevent further DAGCombine |
16668 | // optimizations that may separate the arithmetic operations |
16669 | // from the setcc node. |
16670 | switch (WideVal.getOpcode()) { |
16671 | default: break; |
16672 | case ISD::ADD: ConvertedOp = X86ISD::ADD; break; |
16673 | case ISD::SUB: ConvertedOp = X86ISD::SUB; break; |
16674 | case ISD::AND: ConvertedOp = X86ISD::AND; break; |
16675 | case ISD::OR: ConvertedOp = X86ISD::OR; break; |
16676 | case ISD::XOR: ConvertedOp = X86ISD::XOR; break; |
16677 | } |
16678 | |
16679 | if (ConvertedOp) { |
16680 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
16681 | if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { |
16682 | SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); |
16683 | SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); |
16684 | Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); |
16685 | } |
16686 | } |
16687 | } |
16688 | |
16689 | if (Opcode == 0) { |
16690 | // Emit KTEST for bit vectors |
16691 | if (auto Node = EmitKTEST(Op, DAG, Subtarget)) |
16692 | return Node; |
16693 | |
16694 | // Emit a CMP with 0, which is the TEST pattern. |
16695 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, |
16696 | DAG.getConstant(0, dl, Op.getValueType())); |
16697 | } |
16698 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); |
16699 | SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); |
16700 | |
16701 | SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); |
16702 | DAG.ReplaceAllUsesWith(Op, New); |
16703 | return SDValue(New.getNode(), 1); |
16704 | } |
16705 | |
16706 | /// Emit nodes that will be selected as "cmp Op0,Op1", or something |
16707 | /// equivalent. |
16708 | SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, |
16709 | const SDLoc &dl, SelectionDAG &DAG) const { |
16710 | if (isNullConstant(Op1)) |
16711 | return EmitTest(Op0, X86CC, dl, DAG); |
16712 | |
16713 | assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16714 | "Unexpected comparison operation for MVT::i1 operands");
16715 | |
16716 | if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || |
16717 | Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { |
16718 | // Only promote the compare up to i32 if it is a 16-bit operation
16719 | // with an immediate; 16-bit immediates are to be avoided.
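      | // A 16-bit immediate forces the 0x66 operand-size prefix, which can
      | // cause length-changing-prefix (LCP) decode stalls on many Intel CPUs.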
16720 | if ((Op0.getValueType() == MVT::i16 && |
16721 | (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) && |
16722 | !DAG.getMachineFunction().getFunction()->optForMinSize() && |
16723 | !Subtarget.isAtom()) { |
16724 | unsigned ExtendOp = |
16725 | isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; |
16726 | Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); |
16727 | Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); |
16728 | } |
16729 | // Use SUB instead of CMP to enable CSE between SUB and CMP. |
16730 | SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); |
16731 | SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, |
16732 | Op0, Op1); |
16733 | return SDValue(Sub.getNode(), 1); |
16734 | } |
16735 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); |
16736 | } |
16737 | |
16738 | /// Convert a comparison if required by the subtarget. |
16739 | SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, |
16740 | SelectionDAG &DAG) const { |
16741 | // If the subtarget does not support the FUCOMI instruction, floating-point |
16742 | // comparisons have to be converted. |
16743 | if (Subtarget.hasCMov() || |
16744 | Cmp.getOpcode() != X86ISD::CMP || |
16745 | !Cmp.getOperand(0).getValueType().isFloatingPoint() || |
16746 | !Cmp.getOperand(1).getValueType().isFloatingPoint()) |
16747 | return Cmp; |
16748 | |
16749 | // The instruction selector will select an FUCOM instruction instead of |
16750 | // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence |
16751 | // build an SDNode sequence that transfers the result from FPSW into EFLAGS: |
16752 | // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) |
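      | // FNSTSW stores the 16-bit x87 status word, whose condition bits (C0,
      | // C2, C3) sit in the high byte; the SRL by 8 moves them into the low
      | // byte so SAHF can copy them into CF/PF/ZF in EFLAGS.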
16753 | SDLoc dl(Cmp); |
16754 | SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); |
16755 | SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); |
16756 | SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, |
16757 | DAG.getConstant(8, dl, MVT::i8)); |
16758 | SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); |
16759 | |
16760 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
16761 | assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16762 | return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); |
16763 | } |
16764 | |
16765 | /// Check if replacement of SQRT with RSQRT should be disabled. |
16766 | bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { |
16767 | EVT VT = Op.getValueType(); |
16768 | |
16769 | // We never want to use both SQRT and RSQRT instructions for the same input. |
16770 | if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) |
16771 | return false; |
16772 | |
16773 | if (VT.isVector()) |
16774 | return Subtarget.hasFastVectorFSQRT(); |
16775 | return Subtarget.hasFastScalarFSQRT(); |
16776 | } |
16777 | |
16778 | /// The minimum architected relative accuracy is 2^-12. We need one |
16779 | /// Newton-Raphson step to have a good float result (24 bits of precision). |
16780 | SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, |
16781 | SelectionDAG &DAG, int Enabled, |
16782 | int &RefinementSteps, |
16783 | bool &UseOneConstNR, |
16784 | bool Reciprocal) const { |
16785 | EVT VT = Op.getValueType(); |
16786 | |
16787 | // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. |
16788 | // TODO: Add support for AVX512 (v16f32). |
16789 | // It is likely not profitable to do this for f64 because a double-precision |
16790 | // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 |
16791 | // instructions: convert to single, rsqrtss, convert back to double, refine |
16792 | // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA |
16793 | // along with FMA, this could be a throughput win. |
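      | // One Newton-Raphson step for 1/sqrt(a) computes
      | //   X1 = X0 * (1.5 - 0.5 * a * X0 * X0),
      | // roughly doubling the number of correct bits, which takes the ~12-bit
      | // hardware estimate to the 24-bit precision of an f32 mantissa.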
16794 | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || |
16795 | (VT == MVT::v4f32 && Subtarget.hasSSE1()) || |
16796 | (VT == MVT::v8f32 && Subtarget.hasAVX())) { |
16797 | if (RefinementSteps == ReciprocalEstimate::Unspecified) |
16798 | RefinementSteps = 1; |
16799 | |
16800 | UseOneConstNR = false; |
16801 | return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); |
16802 | } |
16803 | return SDValue(); |
16804 | } |
16805 | |
16806 | /// The minimum architected relative accuracy is 2^-12. We need one |
16807 | /// Newton-Raphson step to have a good float result (24 bits of precision). |
16808 | SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, |
16809 | int Enabled, |
16810 | int &RefinementSteps) const { |
16811 | EVT VT = Op.getValueType(); |
16812 | |
16813 | // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. |
16814 | // TODO: Add support for AVX512 (v16f32). |
16815 | // It is likely not profitable to do this for f64 because a double-precision |
16816 | // reciprocal estimate with refinement on x86 prior to FMA requires |
16817 | // 15 instructions: convert to single, rcpss, convert back to double, refine |
16818 | // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA |
16819 | // along with FMA, this could be a throughput win. |
16820 | |
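      | // One Newton-Raphson step for 1/a computes X1 = X0 * (2 - a * X0),
      | // again roughly doubling the accurate bits of the ~12-bit estimate.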
16821 | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || |
16822 | (VT == MVT::v4f32 && Subtarget.hasSSE1()) || |
16823 | (VT == MVT::v8f32 && Subtarget.hasAVX())) { |
16824 | // Enable estimate codegen with 1 refinement step for vector division. |
16825 | // Scalar division estimates are disabled because they break too much |
16826 | // real-world code. These defaults are intended to match GCC behavior. |
16827 | if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) |
16828 | return SDValue(); |
16829 | |
16830 | if (RefinementSteps == ReciprocalEstimate::Unspecified) |
16831 | RefinementSteps = 1; |
16832 | |
16833 | return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); |
16834 | } |
16835 | return SDValue(); |
16836 | } |
16837 | |
16838 | /// If we have at least two divisions that use the same divisor, convert to |
16839 | /// multiplication by a reciprocal. This may need to be adjusted for a given |
16840 | /// CPU if a division's cost is not at least twice the cost of a multiplication. |
16841 | /// This is because we still need one division to calculate the reciprocal and |
16842 | /// then we need two multiplies by that reciprocal as replacements for the |
16843 | /// original divisions. |
16844 | unsigned X86TargetLowering::combineRepeatedFPDivisors() const { |
16845 | return 2; |
16846 | } |
16847 | |
16848 | /// Helper for creating a X86ISD::SETCC node. |
16849 | static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, |
16850 | SelectionDAG &DAG) { |
16851 | return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, |
16852 | DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); |
16853 | } |
16854 | |
16855 | /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition |
16856 | /// according to equal/not-equal condition code \p CC. |
16857 | static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC, |
16858 | const SDLoc &dl, SelectionDAG &DAG) { |
16859 | // If Src is i8, promote it to i32 with any_extend. There is no i8 BT |
16860 | // instruction. Since the shift amount is in-range-or-undefined, we know |
16861 | // that doing a bittest on the i32 value is ok. We extend to i32 because |
16862 | // the encoding for the i16 version is larger than the i32 version. |
16863 | // Also promote i16 to i32 for performance / code size reasons.
16864 | if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) |
16865 | Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); |
16866 | |
16867 | // See if we can use the 32-bit instruction instead of the 64-bit one for a |
16868 | // shorter encoding. Since the former interprets BitNo modulo 32 and the
16869 | // latter modulo 64, this is only valid if bit 5 of BitNo is known to be
16870 | // zero.
16871 | if (Src.getValueType() == MVT::i64 && |
16872 | DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) |
16873 | Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); |
16874 | |
16875 | // If the operand types disagree, extend the shift amount to match. Since |
16876 | // BT ignores high bits (like shifts) we can use anyextend. |
16877 | if (Src.getValueType() != BitNo.getValueType()) |
16878 | BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); |
16879 | |
16880 | SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); |
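      | // BT copies the tested bit into CF, so bit == 0 maps to COND_AE
      | // (CF == 0) and bit != 0 maps to COND_B (CF == 1).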
16881 | X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; |
16882 | return getSETCC(Cond, BT, dl, DAG);
16883 | } |
16884 | |
16885 | /// Result of 'and' is compared against zero. Change to a BT node if possible. |
16886 | static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, |
16887 | const SDLoc &dl, SelectionDAG &DAG) { |
16888 | SDValue Op0 = And.getOperand(0); |
16889 | SDValue Op1 = And.getOperand(1); |
16890 | if (Op0.getOpcode() == ISD::TRUNCATE) |
16891 | Op0 = Op0.getOperand(0); |
16892 | if (Op1.getOpcode() == ISD::TRUNCATE) |
16893 | Op1 = Op1.getOperand(0); |
16894 | |
16895 | SDValue LHS, RHS; |
16896 | if (Op1.getOpcode() == ISD::SHL) |
16897 | std::swap(Op0, Op1); |
16898 | if (Op0.getOpcode() == ISD::SHL) { |
16899 | if (isOneConstant(Op0.getOperand(0))) { |
16900 | // If we looked past a truncate, check that it's only truncating away |
16901 | // known zeros. |
16902 | unsigned BitWidth = Op0.getValueSizeInBits(); |
16903 | unsigned AndBitWidth = And.getValueSizeInBits(); |
16904 | if (BitWidth > AndBitWidth) { |
16905 | KnownBits Known; |
16906 | DAG.computeKnownBits(Op0, Known); |
16907 | if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) |
16908 | return SDValue(); |
16909 | } |
16910 | LHS = Op1; |
16911 | RHS = Op0.getOperand(1); |
16912 | } |
16913 | } else if (Op1.getOpcode() == ISD::Constant) { |
16914 | ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); |
16915 | uint64_t AndRHSVal = AndRHS->getZExtValue(); |
16916 | SDValue AndLHS = Op0; |
16917 | |
16918 | if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { |
16919 | LHS = AndLHS.getOperand(0); |
16920 | RHS = AndLHS.getOperand(1); |
16921 | } |
16922 | |
16923 | // Use BT if the immediate can't be encoded in a TEST instruction. |
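      | // TEST's immediate is at most 32 bits (sign-extended for 64-bit
      | // operands), so a wider single-bit mask such as (1ULL << 40) would
      | // need the constant in a register, whereas BT encodes the bit index
      | // directly.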
16924 | if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { |
16925 | LHS = AndLHS; |
16926 | RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); |
16927 | } |
16928 | } |
16929 | |
16930 | if (LHS.getNode()) |
16931 | return getBitTestCondition(LHS, RHS, CC, dl, DAG); |
16932 | |
16933 | return SDValue(); |
16934 | } |
16935 | |
16936 | // Convert (truncate (srl X, N) to i1) to (bt X, N) |
16937 | static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, |
16938 | const SDLoc &dl, SelectionDAG &DAG) { |
16939 | |
16940 | assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16941 | "Expected TRUNCATE to i1 node");
16942 | |
16943 | if (Op.getOperand(0).getOpcode() != ISD::SRL) |
16944 | return SDValue(); |
16945 | |
16946 | SDValue ShiftRight = Op.getOperand(0); |
16947 | return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1), |
16948 | CC, dl, DAG); |
16949 | } |
16950 | |
16951 | /// Result of 'and' or 'trunc to i1' is compared against zero. |
16952 | /// Change to a BT node if possible. |
16953 | SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC, |
16954 | const SDLoc &dl, SelectionDAG &DAG) const { |
16955 | if (Op.getOpcode() == ISD::AND) |
16956 | return LowerAndToBT(Op, CC, dl, DAG); |
16957 | if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1) |
16958 | return LowerTruncateToBT(Op, CC, dl, DAG); |
16959 | return SDValue(); |
16960 | } |
16961 | |
16962 | /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask |
16963 | /// CMPs. |
16964 | static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, |
16965 | SDValue &Op1) { |
16966 | unsigned SSECC; |
16967 | bool Swap = false; |
16968 | |
16969 | // SSE Condition code mapping: |
16970 | // 0 - EQ |
16971 | // 1 - LT |
16972 | // 2 - LE |
16973 | // 3 - UNORD |
16974 | // 4 - NEQ |
16975 | // 5 - NLT |
16976 | // 6 - NLE |
16977 | // 7 - ORD |
16978 | switch (SetCCOpcode) { |
16979 | default: llvm_unreachable("Unexpected SETCC condition");
16980 | case ISD::SETOEQ: |
16981 | case ISD::SETEQ: SSECC = 0; break; |
16982 | case ISD::SETOGT: |
16983 | case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16984 | case ISD::SETLT: |
16985 | case ISD::SETOLT: SSECC = 1; break; |
16986 | case ISD::SETOGE: |
16987 | case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16988 | case ISD::SETLE: |
16989 | case ISD::SETOLE: SSECC = 2; break; |
16990 | case ISD::SETUO: SSECC = 3; break; |
16991 | case ISD::SETUNE: |
16992 | case ISD::SETNE: SSECC = 4; break; |
16993 | case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16994 | case ISD::SETUGE: SSECC = 5; break; |
16995 | case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16996 | case ISD::SETUGT: SSECC = 6; break; |
16997 | case ISD::SETO: SSECC = 7; break; |
16998 | case ISD::SETUEQ: |
16999 | case ISD::SETONE: SSECC = 8; break; |
17000 | } |
17001 | if (Swap) |
17002 | std::swap(Op0, Op1); |
17003 | |
17004 | return SSECC; |
17005 | } |
17006 | |
17007 | /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17008 | /// concatenate the result back. |
17009 | static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { |
17010 | MVT VT = Op.getSimpleValueType(); |
17011 | |
17012 | assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17013 | "Unsupported value type for operation");
17014 | |
17015 | unsigned NumElems = VT.getVectorNumElements(); |
17016 | SDLoc dl(Op); |
17017 | SDValue CC = Op.getOperand(2); |
17018 | |
17019 | // Extract the LHS vectors |
17020 | SDValue LHS = Op.getOperand(0); |
17021 | SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); |
17022 | SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); |
17023 | |
17024 | // Extract the RHS vectors |
17025 | SDValue RHS = Op.getOperand(1); |
17026 | SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); |
17027 | SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); |
17028 | |
17029 | // Issue the operation on the smaller types and concatenate the result back |
17030 | MVT EltVT = VT.getVectorElementType(); |
17031 | MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); |
17032 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, |
17033 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), |
17034 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); |
17035 | } |
17036 | |
17037 | static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { |
17038 | SDValue Op0 = Op.getOperand(0); |
17039 | SDValue Op1 = Op.getOperand(1); |
17040 | SDValue CC = Op.getOperand(2); |
17041 | MVT VT = Op.getSimpleValueType(); |
17042 | SDLoc dl(Op); |
17043 | |
17044 | assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17045 | "Unexpected type for boolean compare operation");
17046 | ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); |
17047 | SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, |
17048 | DAG.getConstant(-1, dl, VT)); |
17049 | SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, |
17050 | DAG.getConstant(-1, dl, VT)); |
17051 | switch (SetCCOpcode) { |
17052 | default: llvm_unreachable("Unexpected SETCC condition");
17053 | case ISD::SETEQ: |
17054 | // (x == y) -> ~(x ^ y) |
17055 | return DAG.getNode(ISD::XOR, dl, VT, |
17056 | DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), |
17057 | DAG.getConstant(-1, dl, VT)); |
17058 | case ISD::SETNE: |
17059 | // (x != y) -> (x ^ y) |
17060 | return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); |
17061 | case ISD::SETUGT: |
17062 | case ISD::SETGT: |
17063 | // (x > y) -> (x & ~y) |
17064 | return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); |
17065 | case ISD::SETULT: |
17066 | case ISD::SETLT: |
17067 | // (x < y) -> (~x & y) |
17068 | return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); |
17069 | case ISD::SETULE: |
17070 | case ISD::SETLE: |
17071 | // (x <= y) -> (~x | y) |
17072 | return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); |
17073 | case ISD::SETUGE: |
17074 | case ISD::SETGE: |
17075 | // (x >= y) -> (x | ~y)
17076 | return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); |
17077 | } |
17078 | } |
17079 | |
17080 | static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { |
17081 | |
17082 | SDValue Op0 = Op.getOperand(0); |
17083 | SDValue Op1 = Op.getOperand(1); |
17084 | SDValue CC = Op.getOperand(2); |
17085 | MVT VT = Op.getSimpleValueType(); |
17086 | SDLoc dl(Op); |
17087 | |
17088 | assert(VT.getVectorElementType() == MVT::i1 &&
17089 | "Cannot set masked compare for this operation");
17090 | |
17091 | ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); |
17092 | unsigned Opc = 0; |
17093 | bool Unsigned = false; |
17094 | bool Swap = false; |
17095 | unsigned SSECC; |
17096 | switch (SetCCOpcode) { |
17097 | default: llvm_unreachable("Unexpected SETCC condition");
17098 | case ISD::SETNE: SSECC = 4; break; |
17099 | case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; |
17100 | case ISD::SETUGT: SSECC = 6; Unsigned = true; break; |
17101 | case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17102 | case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; |
17103 | case ISD::SETULT: SSECC = 1; Unsigned = true; break; |
17104 | case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT |
17105 | case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap |
17106 | case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17107 | case ISD::SETLE: SSECC = 2; break; |
17108 | } |
17109 | |
17110 | if (Swap) |
17111 | std::swap(Op0, Op1); |
17112 | if (Opc) |
17113 | return DAG.getNode(Opc, dl, VT, Op0, Op1); |
17114 | Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
17115 | return DAG.getNode(Opc, dl, VT, Op0, Op1, |
17116 | DAG.getConstant(SSECC, dl, MVT::i8)); |
17117 | } |
17118 | |
17119 | /// \brief Try to turn a VSETULT into a VSETULE by modifying its second |
17120 | /// operand \p Op1. If non-trivial (for example because it's not constant) |
17121 | /// return an empty value. |
17122 | static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1, |
17123 | SelectionDAG &DAG) { |
17124 | BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); |
17125 | if (!BV) |
17126 | return SDValue(); |
17127 | |
17128 | MVT VT = Op1.getSimpleValueType(); |
17129 | MVT EVT = VT.getVectorElementType(); |
17130 | unsigned n = VT.getVectorNumElements(); |
17131 | SmallVector<SDValue, 8> ULTOp1; |
17132 | |
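      | // Rewrite each constant element C of "x <u C" as C - 1, using the
      | // identity x <u C iff x <=u C - 1, which holds only for C != 0 (hence
      | // the underflow check below).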
17133 | for (unsigned i = 0; i < n; ++i) { |
17134 | ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); |
17135 | if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) |
17136 | return SDValue(); |
17137 | |
17138 | // Avoid underflow. |
17139 | APInt Val = Elt->getAPIntValue(); |
17140 | if (Val == 0) |
17141 | return SDValue(); |
17142 | |
17143 | ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); |
17144 | } |
17145 | |
17146 | return DAG.getBuildVector(VT, dl, ULTOp1); |
17147 | } |
17148 | |
17149 | static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, |
17150 | SelectionDAG &DAG) { |
17151 | SDValue Op0 = Op.getOperand(0); |
17152 | SDValue Op1 = Op.getOperand(1); |
17153 | SDValue CC = Op.getOperand(2); |
17154 | MVT VT = Op.getSimpleValueType(); |
17155 | ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); |
17156 | bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); |
17157 | SDLoc dl(Op); |
17158 | |
17159 | if (isFP) { |
17160 | #ifndef NDEBUG |
17161 | MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); |
17162 | assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17163 | #endif |
17164 | |
17165 | unsigned Opc; |
17166 | if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { |
17167 | assert(VT.getVectorNumElements() <= 16);
17168 | Opc = X86ISD::CMPM; |
17169 | } else { |
17170 | Opc = X86ISD::CMPP; |
17171 | // The SSE/AVX packed FP comparison nodes are defined with a |
17172 | // floating-point vector result that matches the operand type. This allows |
17173 | // them to work with an SSE1 target (integer vector types are not legal). |
17174 | VT = Op0.getSimpleValueType(); |
17175 | } |
17176 | |
17177 | // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), |
17178 | // emit two comparisons and a logic op to tie them together. |
17179 | // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is |
17180 | // available. |
17181 | SDValue Cmp; |
17182 | unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); |
17183 | if (SSECC == 8) { |
17184 | // LLVM predicate is SETUEQ or SETONE. |
17185 | unsigned CC0, CC1; |
17186 | unsigned CombineOpc; |
17187 | if (Cond == ISD::SETUEQ) { |
17188 | CC0 = 3; // UNORD |
17189 | CC1 = 0; // EQ |
17190 | CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : |
17191 | static_cast<unsigned>(ISD::OR); |
17192 | } else { |
17193 | assert(Cond == ISD::SETONE);
17194 | CC0 = 7; // ORD |
17195 | CC1 = 4; // NEQ |
17196 | CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : |
17197 | static_cast<unsigned>(ISD::AND); |
17198 | } |
17199 | |
17200 | SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, |
17201 | DAG.getConstant(CC0, dl, MVT::i8)); |
17202 | SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, |
17203 | DAG.getConstant(CC1, dl, MVT::i8)); |
17204 | Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); |
17205 | } else { |
17206 | // Handle all other FP comparisons here. |
17207 | Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, |
17208 | DAG.getConstant(SSECC, dl, MVT::i8)); |
17209 | } |
17210 | |
17211 | // If this is SSE/AVX CMPP, bitcast the result back to integer to match the |
17212 | // result type of SETCC. The bitcast is expected to be optimized away |
17213 | // during combining/isel. |
17214 | if (Opc == X86ISD::CMPP) |
17215 | Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); |
17216 | |
17217 | return Cmp; |
17218 | } |
17219 | |
17220 | MVT VTOp0 = Op0.getSimpleValueType(); |
17221 | assert(VTOp0 == Op1.getSimpleValueType() &&
17222 | "Expected operands with same type!");
17223 | assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17224 | "Invalid number of packed elements for source and destination!");
17225 | |
17226 | if (VT.is128BitVector() && VTOp0.is256BitVector()) { |
17227 | // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type |
17228 | // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the |
17229 | // legalizer first checks whether the first operand of the setcc has
17230 | // a legal type. If so, it promotes the return type to that same type.
17231 | // Otherwise, the return type is promoted to the 'next legal type' which, |
17232 | // for a vector of MVT::i1 is always a 128-bit integer vector type. |
17233 | // |
17234 | // We reach this code only if the following two conditions are met: |
17235 | // 1. Both return type and operand type have been promoted to wider types |
17236 | // by the type legalizer. |
17237 | // 2. The original operand type has been promoted to a 256-bit vector. |
17238 | // |
17239 | // Note that condition 2. only applies for AVX targets. |
17240 | SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); |
17241 | return DAG.getZExtOrTrunc(NewOp, dl, VT); |
17242 | } |
17243 | |
17244 | // The non-AVX512 code below works under the assumption that source and |
17245 | // destination types are the same. |
17246 | assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17247 | "Value types for source and destination must be the same!");
17248 | |
17249 | // Break 256-bit integer vector compare into smaller ones. |
17250 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
17251 | return Lower256IntVSETCC(Op, DAG); |
17252 | |
17253 | // Operands are boolean (vectors of i1) |
17254 | MVT OpVT = Op1.getSimpleValueType(); |
17255 | if (OpVT.getVectorElementType() == MVT::i1) |
17256 | return LowerBoolVSETCC_AVX512(Op, DAG); |
17257 | |
17258 | // The result is boolean, but operands are int/float |
17259 | if (VT.getVectorElementType() == MVT::i1) { |
17260 | // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17261 | // but there is no compare instruction for i8 and i16 elements in KNL.
17262 | // In that case, use an SSE compare instead.
17263 | bool UseAVX512Inst = |
17264 | (OpVT.is512BitVector() || |
17265 | OpVT.getScalarSizeInBits() >= 32 || |
17266 | (Subtarget.hasBWI() && Subtarget.hasVLX())); |
17267 | |
17268 | if (UseAVX512Inst) |
17269 | return LowerIntVSETCC_AVX512(Op, DAG); |
17270 | |
17271 | return DAG.getNode(ISD::TRUNCATE, dl, VT, |
17272 | DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); |
17273 | } |
17274 | |
17275 | // Lower using XOP integer comparisons. |
17276 | if ((VT == MVT::v16i8 || VT == MVT::v8i16 || |
17277 | VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { |
17278 | // Translate compare code to XOP PCOM compare mode. |
17279 | unsigned CmpMode = 0; |
17280 | switch (Cond) { |
17281 | default: llvm_unreachable("Unexpected SETCC condition");
17282 | case ISD::SETULT: |
17283 | case ISD::SETLT: CmpMode = 0x00; break; |
17284 | case ISD::SETULE: |
17285 | case ISD::SETLE: CmpMode = 0x01; break; |
17286 | case ISD::SETUGT: |
17287 | case ISD::SETGT: CmpMode = 0x02; break; |
17288 | case ISD::SETUGE: |
17289 | case ISD::SETGE: CmpMode = 0x03; break; |
17290 | case ISD::SETEQ: CmpMode = 0x04; break; |
17291 | case ISD::SETNE: CmpMode = 0x05; break; |
17292 | } |
17293 | |
17294 | // Are we comparing unsigned or signed integers? |
17295 | unsigned Opc = |
17296 | ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; |
17297 | |
17298 | return DAG.getNode(Opc, dl, VT, Op0, Op1, |
17299 | DAG.getConstant(CmpMode, dl, MVT::i8)); |
17300 | } |
17301 | |
17302 | // We are handling one of the integer comparisons here. Since SSE only has |
17303 | // GT and EQ comparisons for integer, swapping operands and multiple |
17304 | // operations may be required for some comparisons. |
17305 | unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ |
17306 | : X86ISD::PCMPGT; |
17307 | bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || |
17308 | Cond == ISD::SETGE || Cond == ISD::SETUGE; |
17309 | bool Invert = Cond == ISD::SETNE || |
17310 | (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); |
17311 | |
17312 | // If both operands are known non-negative, then an unsigned compare is the |
17313 | // same as a signed compare and there's no need to flip sign bits.
17314 | // TODO: We could check for more general simplifications here since we're |
17315 | // computing known bits. |
17316 | bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && |
17317 | !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); |
17318 | |
17319 | // Special case: Use min/max operations for SETULE/SETUGE |
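      | // x <=u y iff umin(x, y) == x, and x >=u y iff umax(x, y) == x; the
      | // min/max result is compared against Op0 with PCMPEQ further below.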
17320 | MVT VET = VT.getVectorElementType(); |
17321 | bool HasMinMax = |
17322 | (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || |
17323 | (Subtarget.hasSSE2() && (VET == MVT::i8)); |
17324 | bool MinMax = false; |
17325 | if (HasMinMax) { |
17326 | switch (Cond) { |
17327 | default: break; |
17328 | case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; |
17329 | case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; |
17330 | } |
17331 | |
17332 | if (MinMax) |
17333 | Swap = Invert = FlipSigns = false; |
17334 | } |
17335 | |
17336 | bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); |
17337 | bool Subus = false; |
17338 | if (!MinMax && HasSubus) { |
17339 | // As another special case, use PSUBUS[BW] when it's profitable. E.g. for |
17340 | // Op0 u<= Op1: |
17341 | // t = psubus Op0, Op1 |
17342 | // pcmpeq t, <0..0> |
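      | // PSUBUS subtracts with unsigned saturation, i.e. max(a - b, 0), so
      | // its result is zero exactly when Op0 <=u Op1.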
17343 | switch (Cond) { |
17344 | default: break; |
17345 | case ISD::SETULT: { |
17346 | // If the comparison is against a constant we can turn this into a |
17347 | // setule. With psubus, setule does not require a swap. This is |
17348 | // beneficial because the register holding the constant is no longer
17349 | // clobbered as the destination, so it can be hoisted out of a loop.
17350 | // Only do this pre-AVX since vpcmp* is no longer destructive. |
17351 | if (Subtarget.hasAVX()) |
17352 | break; |
17353 | if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) { |
17354 | Op1 = ULEOp1; |
17355 | Subus = true; Invert = false; Swap = false; |
17356 | } |
17357 | break; |
17358 | } |
17359 | // Psubus is better than flip-sign because it requires no inversion. |
17360 | case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; |
17361 | case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; |
17362 | } |
17363 | |
17364 | if (Subus) { |
17365 | Opc = X86ISD::SUBUS; |
17366 | FlipSigns = false; |
17367 | } |
17368 | } |
17369 | |
17370 | if (Swap) |
17371 | std::swap(Op0, Op1); |
17372 | |
17373 | // Check that the operation in question is available (most are plain SSE2, |
17374 | // but PCMPGTQ and PCMPEQQ have different requirements). |
17375 | if (VT == MVT::v2i64) { |
17376 | if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { |
17377 | assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17378 | |
17379 | // First cast everything to the right type. |
17380 | Op0 = DAG.getBitcast(MVT::v4i32, Op0); |
17381 | Op1 = DAG.getBitcast(MVT::v4i32, Op1); |
17382 | |
17383 | // Since SSE has no unsigned integer comparisons, we need to flip the sign |
17384 | // bits of the inputs before performing those operations. The lower |
17385 | // compare is always unsigned. |
17386 | SDValue SB; |
17387 | if (FlipSigns) { |
17388 | SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); |
17389 | } else { |
17390 | SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); |
17391 | SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); |
17392 | SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); |
17393 | } |
17394 | Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); |
17395 | Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); |
17396 | |
17397 | // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) |
17398 | SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); |
17399 | SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); |
17400 | |
17401 | // Create masks for only the low parts/high parts of the 64-bit integers.
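      | // Each shuffle replicates one 32-bit comparison result across both
      | // dwords of its 64-bit lane, so the AND/OR below produce an all-ones
      | // or all-zeros mask per lane.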
17402 | static const int MaskHi[] = { 1, 1, 3, 3 }; |
17403 | static const int MaskLo[] = { 0, 0, 2, 2 }; |
17404 | SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); |
17405 | SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); |
17406 | SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); |
17407 | |
17408 | SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); |
17409 | Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); |
17410 | |
17411 | if (Invert) |
17412 | Result = DAG.getNOT(dl, Result, MVT::v4i32); |
17413 | |
17414 | return DAG.getBitcast(VT, Result); |
17415 | } |
17416 | |
17417 | if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { |
17418 | // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
17419 | // with pcmpeqd + pshufd + pand.
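      | // A 64-bit lane is equal iff both of its 32-bit halves compare equal;
      | // the {1, 0, 3, 2} shuffle swaps the halves within each lane so the
      | // AND is all-ones only when both halves matched.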
17420 | assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17421 | |
17422 | // First cast everything to the right type. |
17423 | Op0 = DAG.getBitcast(MVT::v4i32, Op0); |
17424 | Op1 = DAG.getBitcast(MVT::v4i32, Op1); |
17425 | |
17426 | // Do the compare. |
17427 | SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); |
17428 | |
17429 | // Make sure the lower and upper halves are both all-ones. |
17430 | static const int Mask[] = { 1, 0, 3, 2 }; |
17431 | SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); |
17432 | Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); |
17433 | |
17434 | if (Invert) |
17435 | Result = DAG.getNOT(dl, Result, MVT::v4i32); |
17436 | |
17437 | return DAG.getBitcast(VT, Result); |
17438 | } |
17439 | } |
17440 | |
17441 | // Since SSE has no unsigned integer comparisons, we need to flip the sign |
17442 | // bits of the inputs before performing those operations. |
17443 | if (FlipSigns) { |
17444 | MVT EltVT = VT.getVectorElementType(); |
17445 | SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, |
17446 | VT); |
17447 | Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); |
17448 | Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); |
17449 | } |
17450 | |
17451 | SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); |
17452 | |
17453 | // If the logical-not of the result is required, perform that now. |
17454 | if (Invert) |
17455 | Result = DAG.getNOT(dl, Result, VT); |
17456 | |
17457 | if (MinMax) |
17458 | Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); |
17459 | |
17460 | if (Subus) |
17461 | Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, |
17462 | getZeroVector(VT, Subtarget, DAG, dl)); |
17463 | |
17464 | return Result; |
17465 | } |
17466 | |
17467 | SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
17468 | |
17469 | MVT VT = Op.getSimpleValueType(); |
17470 | |
17471 | if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); |
17472 | |
17473 | assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17474 | SDValue Op0 = Op.getOperand(0); |
17475 | SDValue Op1 = Op.getOperand(1); |
17476 | SDLoc dl(Op); |
17477 | ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); |
17478 | |
17479 | // Optimize to BT if possible. |
17480 | // Lower (X & (1 << N)) == 0 to BT(X, N). |
17481 | // Lower ((X >>u N) & 1) != 0 to BT(X, N). |
17482 | // Lower ((X >>s N) & 1) != 0 to BT(X, N). |
17483 | // Lower (trunc (X >> N) to i1) to BT(X, N). |
17484 | if (Op0.hasOneUse() && isNullConstant(Op1) && |
17485 | (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
17486 | if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { |
17487 | if (VT == MVT::i1) |
17488 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); |
17489 | return NewSetCC; |
17490 | } |
17491 | } |
17492 | |
17493 | // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of |
17494 | // these. |
17495 | if ((isOneConstant(Op1) || isNullConstant(Op1)) && |
17496 | (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
17497 | |
17498 | // If the input is a setcc, then reuse the input setcc or use a new one with |
17499 | // the inverted condition. |
17500 | if (Op0.getOpcode() == X86ISD::SETCC) { |
17501 | X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); |
17502 | bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); |
17503 | if (!Invert) |
17504 | return Op0; |
17505 | |
17506 | CCode = X86::GetOppositeBranchCondition(CCode); |
17507 | SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG); |
17508 | if (VT == MVT::i1) |
17509 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); |
17510 | return SetCC; |
17511 | } |
17512 | } |
17513 | if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
17514 | if (isOneConstant(Op1)) { |
17515 | ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); |
17516 | return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); |
17517 | } |
17518 | if (!isNullConstant(Op1)) { |
17519 | SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1); |
17520 | return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC); |
17521 | } |
17522 | } |
17523 | |
17524 | bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); |
17525 | X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); |
17526 | if (X86CC == X86::COND_INVALID) |
17527 | return SDValue(); |
17528 | |
17529 | SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); |
17530 | EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); |
17531 | SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG); |
17532 | if (VT == MVT::i1) |
17533 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); |
17534 | return SetCC; |
17535 | } |
17536 | |
17537 | SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { |
17538 | SDValue LHS = Op.getOperand(0); |
17539 | SDValue RHS = Op.getOperand(1); |
17540 | SDValue Carry = Op.getOperand(2); |
17541 | SDValue Cond = Op.getOperand(3); |
17542 | SDLoc DL(Op); |
17543 | |
17544 | assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17545 | X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); |
17546 | |
17547 | // Recreate the carry if needed. |
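      | // Adding all-ones (-1) to the carry value produces a hardware
      | // carry-out exactly when the value is nonzero; the flags result
      | // (value 1) of this node feeds the SBB below.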
17548 | EVT CarryVT = Carry.getValueType(); |
17549 | APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); |
17550 | Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), |
17551 | Carry, DAG.getConstant(NegOne, DL, CarryVT)); |
17552 | |
17553 | SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); |
17554 | SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); |
17555 | SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); |
17556 | if (Op.getSimpleValueType() == MVT::i1) |
17557 | return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); |
17558 | return SetCC; |
17559 | } |
17560 | |
17561 | /// Return true if opcode is a X86 logical comparison. |
17562 | static bool isX86LogicalCmp(SDValue Op) { |
17563 | unsigned Opc = Op.getOpcode(); |
17564 | if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || |
17565 | Opc == X86ISD::SAHF) |
17566 | return true; |
17567 | if (Op.getResNo() == 1 && |
17568 | (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || |
17569 | Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || |
17570 | Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || |
17571 | Opc == X86ISD::XOR || Opc == X86ISD::AND)) |
17572 | return true; |
17573 | |
17574 | if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) |
17575 | return true; |
17576 | |
17577 | return false; |
17578 | } |
17579 | |
17580 | static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { |
17581 | if (V.getOpcode() != ISD::TRUNCATE) |
17582 | return false; |
17583 | |
17584 | SDValue VOp0 = V.getOperand(0); |
17585 | unsigned InBits = VOp0.getValueSizeInBits(); |
17586 | unsigned Bits = V.getValueSizeInBits(); |
17587 | return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); |
17588 | } |
17589 | |
17590 | SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { |
17591 | bool AddTest = true; |
17592 | SDValue Cond = Op.getOperand(0); |
17593 | SDValue Op1 = Op.getOperand(1); |
17594 | SDValue Op2 = Op.getOperand(2); |
17595 | SDLoc DL(Op); |
17596 | MVT VT = Op1.getSimpleValueType(); |
17597 | SDValue CC; |
17598 | |
17599 | // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops |
17600 | // are available or VBLENDV if AVX is available. |
17601 | // Otherwise FP cmovs get lowered into a less efficient branch sequence later. |
17602 | if (Cond.getOpcode() == ISD::SETCC && |
17603 | ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || |
17604 | (Subtarget.hasSSE1() && VT == MVT::f32)) && |
17605 | VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { |
17606 | SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); |
17607 | int SSECC = translateX86FSETCC( |
17608 | cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); |
17609 | |
17610 | if (SSECC != 8) { |
17611 | if (Subtarget.hasAVX512()) { |
17612 | SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, |
17613 | CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); |
17614 | return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, |
17615 | DL, VT, Cmp, Op1, Op2); |
17616 | } |
17617 | |
17618 | SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, |
17619 | DAG.getConstant(SSECC, DL, MVT::i8)); |
17620 | |
17621 | // If we have AVX, we can use a variable vector select (VBLENDV) instead |
17622 | // of 3 logic instructions for size savings and potentially speed. |
17623 | // Unfortunately, there is no scalar form of VBLENDV. |
17624 | |
17625 | // If either operand is a constant, don't try this. We can expect to |
17626 | // optimize away at least one of the logic instructions later in that |
17627 | // case, so that sequence would be faster than a variable blend. |
17628 | |
17629 | // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly |
17630 | // uses XMM0 as the selection register. That may need just as many |
17631 | // instructions as the AND/ANDN/OR sequence due to register moves, so |
17632 | // don't bother. |
17633 | |
17634 | if (Subtarget.hasAVX() && |
17635 | !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { |
17636 | |
17637 | // Convert to vectors, do a VSELECT, and convert back to scalar. |
17638 | // All of the conversions should be optimized away. |
17639 | |
17640 | MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; |
17641 | SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); |
17642 | SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); |
17643 | SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); |
17644 | |
17645 | MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; |
17646 | VCmp = DAG.getBitcast(VCmpVT, VCmp); |
17647 | |
17648 | SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); |
17649 | |
17650 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, |
17651 | VSel, DAG.getIntPtrConstant(0, DL)); |
17652 | } |
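      | // Classic branchless select with the all-ones/all-zeros compare mask:
      | // (Cmp & Op1) | (~Cmp & Op2).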
17653 | SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); |
17654 | SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); |
17655 | return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); |
17656 | } |
17657 | } |
17658 | |
17659 | // AVX512 fallback is to lower selects of scalar floats to masked moves. |
17660 | if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { |
17661 | SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); |
17662 | return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); |
17663 | } |
17664 | |
17665 | if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { |
17666 | SDValue Op1Scalar; |
17667 | if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) |
17668 | Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); |
17669 | else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) |
17670 | Op1Scalar = Op1.getOperand(0); |
17671 | SDValue Op2Scalar; |
17672 | if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) |
17673 | Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); |
17674 | else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) |
17675 | Op2Scalar = Op2.getOperand(0); |
17676 | if (Op1Scalar.getNode() && Op2Scalar.getNode()) { |
17677 | SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, |
17678 | Op1Scalar, Op2Scalar); |
17679 | if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) |
17680 | return DAG.getBitcast(VT, newSelect); |
17681 | SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); |
17682 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, |
17683 | DAG.getIntPtrConstant(0, DL)); |
17684 | } |
17685 | } |
17686 | |
17687 | if (VT == MVT::v4i1 || VT == MVT::v2i1) { |
17688 | SDValue zeroConst = DAG.getIntPtrConstant(0, DL); |
17689 | Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, |
17690 | DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); |
17691 | Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, |
17692 | DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); |
17693 | SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); |
17694 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); |
17695 | } |
17696 | |
17697 | if (Cond.getOpcode() == ISD::SETCC) { |
17698 | if (SDValue NewCond = LowerSETCC(Cond, DAG)) { |
17699 | Cond = NewCond; |
17700 | // If the condition was updated, it's possible that the operands of the |
17701 | // select were also updated (for example, EmitTest has a RAUW). Refresh |
17702 | // the local references to the select operands in case they got stale. |
17703 | Op1 = Op.getOperand(1); |
17704 | Op2 = Op.getOperand(2); |
17705 | } |
17706 | } |
17707 | |
17708 | // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y |
17709 | // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y |
17710 | // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y |
17711 | // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y |
17712 | // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y |
17713 | // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y |
17714 | if (Cond.getOpcode() == X86ISD::SETCC && |
17715 | Cond.getOperand(1).getOpcode() == X86ISD::CMP && |
17716 | isNullConstant(Cond.getOperand(1).getOperand(1))) { |
17717 | SDValue Cmp = Cond.getOperand(1); |
17718 | unsigned CondCode = |
17719 | cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); |
17720 | |
17721 | if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && |
17722 | (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { |
17723 | SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; |
17724 | SDValue CmpOp0 = Cmp.getOperand(0); |
17725 | |
17726 | // Apply further optimizations for special cases |
17727 | // (select (x != 0), -1, 0) -> neg & sbb |
17728 | // (select (x == 0), 0, -1) -> neg & sbb |
17729 | if (isNullConstant(Y) && |
17730 | (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { |
17731 | SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); |
17732 | SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); |
17733 | SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); |
17734 | SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), |
17735 | DAG.getConstant(X86::COND_B, DL, MVT::i8), |
17736 | SDValue(Neg.getNode(), 1)); |
17737 | return Res; |
17738 | } |
17739 | |
17740 | Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, |
17741 | CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); |
17742 | Cmp = ConvertCmpIfNecessary(Cmp, DAG); |
17743 | |
17744 | SDValue Res = // Res = 0 or -1. |
17745 | DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), |
17746 | DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); |
17747 | |
17748 | if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) |
17749 | Res = DAG.getNOT(DL, Res, Res.getValueType()); |
17750 | |
17751 | if (!isNullConstant(Op2)) |
17752 | Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); |
17753 | return Res; |
17754 | } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && |
17755 | Cmp.getOperand(0).getOpcode() == ISD::AND && |
17756 | isOneConstant(Cmp.getOperand(0).getOperand(1))) { |
17757 | SDValue CmpOp0 = Cmp.getOperand(0); |
17758 | SDValue Src1, Src2; |
17759 | // Returns true if Op2 is an XOR or OR operation and one of its
17760 | // operands equals Op1:
17761 | // (a, a op b) or (b, a op b)
17762 | auto isOrXorPattern = [&]() { |
17763 | if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && |
17764 | (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { |
17765 | Src1 = |
17766 | Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); |
17767 | Src2 = Op1; |
17768 | return true; |
17769 | } |
17770 | return false; |
17771 | }; |
17772 | |
17773 | if (isOrXorPattern()) { |
17774 | SDValue Neg; |
17775 | unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); |
17776 | // We need a mask of all zeros or all ones, with the same size as the
17777 | // other operands.
17778 | if (CmpSz > VT.getSizeInBits()) |
17779 | Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); |
17780 | else if (CmpSz < VT.getSizeInBits()) |
17781 | Neg = DAG.getNode(ISD::AND, DL, VT, |
17782 | DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), |
17783 | DAG.getConstant(1, DL, VT)); |
17784 | else |
17785 | Neg = CmpOp0; |
17786 | SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), |
17787 | Neg); // -(and (x, 0x1)) |
17788 | SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z |
17789 | return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y |
17790 | } |
17791 | } |
17792 | } |
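// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// Two of the rewrites above, checked in scalar C++ (assumes wrap-around 32-bit
// unsigned arithmetic; helper names are ours). SETCC_CARRY of CMP(x, 1)
// materializes the borrow of x - 1, i.e. an all-ones mask exactly when x == 0:
#include <cassert>
#include <cstdint>
// (select (x == 0), -1, y) -> (borrow of x - 1) | y
static uint32_t selEqZeroAllOnes(uint32_t X, uint32_t Y) {
  uint32_t Mask = 0u - uint32_t(X < 1); // -1 iff x == 0, else 0
  return Mask | Y;
}
// (select ((x & 1) == 0), y, (z ^ y)) -> (-(x & 1) & z) ^ y
static uint32_t selBitXor(uint32_t X, uint32_t Y, uint32_t Z) {
  return ((0u - (X & 1)) & Z) ^ Y;
}
static void checkSelects() {
  assert(selEqZeroAllOnes(0, 5) == ~0u && selEqZeroAllOnes(3, 5) == 5);
  assert(selBitXor(2, 5, 9) == 5 && selBitXor(3, 5, 9) == (9 ^ 5));
}
// --- [End editor's note] ------------------------------------------------------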
17793 | |
17794 | // Look past (and (setcc_carry (cmp ...)), 1). |
17795 | if (Cond.getOpcode() == ISD::AND && |
17796 | Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && |
17797 | isOneConstant(Cond.getOperand(1))) |
17798 | Cond = Cond.getOperand(0); |
17799 | |
17800 | // If condition flag is set by a X86ISD::CMP, then use it as the condition |
17801 | // setting operand in place of the X86ISD::SETCC. |
17802 | unsigned CondOpcode = Cond.getOpcode(); |
17803 | if (CondOpcode == X86ISD::SETCC || |
17804 | CondOpcode == X86ISD::SETCC_CARRY) { |
17805 | CC = Cond.getOperand(0); |
17806 | |
17807 | SDValue Cmp = Cond.getOperand(1); |
17808 | unsigned Opc = Cmp.getOpcode(); |
17809 | MVT VT = Op.getSimpleValueType(); |
17810 | |
17811 | bool IllegalFPCMov = false; |
17812 | if (VT.isFloatingPoint() && !VT.isVector() && |
17813 | !isScalarFPTypeInSSEReg(VT)) // FPStack? |
17814 | IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); |
17815 | |
17816 | if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || |
17817 | Opc == X86ISD::BT) { // FIXME |
17818 | Cond = Cmp; |
17819 | AddTest = false; |
17820 | } |
17821 | } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || |
17822 | CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || |
17823 | ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && |
17824 | Cond.getOperand(0).getValueType() != MVT::i8)) { |
17825 | SDValue LHS = Cond.getOperand(0); |
17826 | SDValue RHS = Cond.getOperand(1); |
17827 | unsigned X86Opcode; |
17828 | unsigned X86Cond; |
17829 | SDVTList VTs; |
17830 | switch (CondOpcode) { |
17831 | case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; |
17832 | case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; |
17833 | case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; |
17834 | case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; |
17835 | case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; |
17836 | case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; |
17837 | default: llvm_unreachable("unexpected overflowing operator");
17838 | } |
17839 | if (CondOpcode == ISD::UMULO) |
17840 | VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), |
17841 | MVT::i32); |
17842 | else |
17843 | VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); |
17844 | |
17845 | SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); |
17846 | |
17847 | if (CondOpcode == ISD::UMULO) |
17848 | Cond = X86Op.getValue(2); |
17849 | else |
17850 | Cond = X86Op.getValue(1); |
17851 | |
17852 | CC = DAG.getConstant(X86Cond, DL, MVT::i8); |
17853 | AddTest = false; |
17854 | } |
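// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// The switch above maps each ISD overflow op to the flag the X86 node sets:
// unsigned ops overflow into the carry flag (X86::COND_B), signed ops into the
// overflow flag (X86::COND_O). A scalar sketch of the two conditions, using
// the GCC/Clang checked-add builtin:
#include <cstdint>
static bool uaddOverflows(uint32_t A, uint32_t B) {
  return A + B < A; // carry out of the top bit, i.e. CF / X86::COND_B
}
static bool saddOverflows(int32_t A, int32_t B) {
  int32_t R;
  return __builtin_add_overflow(A, B, &R); // signed overflow, i.e. OF / COND_O
}
// --- [End editor's note] ------------------------------------------------------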
17855 | |
17856 | if (AddTest) { |
17857 | // Look past the truncate if the high bits are known zero. |
17858 | if (isTruncWithZeroHighBitsInput(Cond, DAG)) |
17859 | Cond = Cond.getOperand(0); |
17860 | |
17861 | // We know the result of AND is compared against zero. Try to match |
17862 | // it to BT. |
17863 | if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { |
17864 | if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { |
17865 | CC = NewSetCC.getOperand(0); |
17866 | Cond = NewSetCC.getOperand(1); |
17867 | AddTest = false; |
17868 | } |
17869 | } |
17870 | } |
17871 | |
17872 | if (AddTest) { |
17873 | CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); |
17874 | Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); |
17875 | } |
17876 | |
17877 | // a < b ? -1 : 0 -> RES = ~setcc_carry |
17878 | // a < b ? 0 : -1 -> RES = setcc_carry |
17879 | // a >= b ? -1 : 0 -> RES = setcc_carry |
17880 | // a >= b ? 0 : -1 -> RES = ~setcc_carry |
17881 | if (Cond.getOpcode() == X86ISD::SUB) { |
17882 | Cond = ConvertCmpIfNecessary(Cond, DAG); |
17883 | unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); |
17884 | |
17885 | if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && |
17886 | (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && |
17887 | (isNullConstant(Op1) || isNullConstant(Op2))) { |
17888 | SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), |
17889 | DAG.getConstant(X86::COND_B, DL, MVT::i8), |
17890 | Cond); |
17891 | if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) |
17892 | return DAG.getNOT(DL, Res, Res.getValueType()); |
17893 | return Res; |
17894 | } |
17895 | } |
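// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// The four patterns above all reduce to the borrow of the subtraction: after
// cmp a, b the carry flag is (a < b) unsigned, SETCC_CARRY(COND_B) broadcasts
// it into an all-ones/all-zeros mask, and a NOT handles the inverted
// selections. Scalar check (helper name is ours):
#include <cassert>
#include <cstdint>
static uint32_t belowAllOnes(uint32_t A, uint32_t B) {
  return 0u - uint32_t(A < B); // a < b ? -1 : 0, like sbb reg, reg after cmp
}
static void checkCarrySelect() {
  assert(belowAllOnes(1, 2) == ~0u);  // a < b  -> mask of ones
  assert(~belowAllOnes(1, 2) == 0u);  // the inverted selection is its NOT
  assert(belowAllOnes(2, 1) == 0u);   // a >= b -> mask of zeros
}
// --- [End editor's note] ------------------------------------------------------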
17896 | |
17897 | // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
17898 | // widen the cmov and push the truncate through. This avoids introducing a new
17899 | // branch during isel and doesn't add any extensions.
17900 | if (Op.getValueType() == MVT::i8 && |
17901 | Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { |
17902 | SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); |
17903 | if (T1.getValueType() == T2.getValueType() && |
17904 | // Blacklist CopyFromReg to avoid partial register stalls. |
17905 | T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ |
17906 | SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); |
17907 | SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); |
17908 | return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); |
17909 | } |
17910 | } |
17911 | |
17912 | // X86ISD::CMOV means set the result (which is operand 1) to the RHS if |
17913 | // condition is true. |
17914 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); |
17915 | SDValue Ops[] = { Op2, Op1, CC, Cond }; |
17916 | return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); |
17917 | } |
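// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// Per the comment above, the first CMOV operand is the value kept when the
// condition is false and the second replaces it when the condition is true,
// which is why the node is built with the operands ordered { Op2, Op1, CC,
// Cond }. A scalar model (names are ours):
template <typename T>
static T cmovModel(bool CondHolds, T FalseVal /*Op2*/, T TrueVal /*Op1*/) {
  return CondHolds ? TrueVal : FalseVal; // cmovCC dst(FalseVal), src(TrueVal)
}
// --- [End editor's note] ------------------------------------------------------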
17918 | |
17919 | static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, |
17920 | const X86Subtarget &Subtarget, |
17921 | SelectionDAG &DAG) { |
17922 | MVT VT = Op->getSimpleValueType(0); |
17923 | SDValue In = Op->getOperand(0); |
17924 | MVT InVT = In.getSimpleValueType(); |
17925 | MVT VTElt = VT.getVectorElementType(); |
17926 | MVT InVTElt = InVT.getVectorElementType(); |
17927 | SDLoc dl(Op); |
17928 | |
17929 | // SKX processor |
17930 | if ((InVTElt == MVT::i1) &&
17931 | ((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) ||
17933 | (Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))
17935 | return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17936 | |
17937 | unsigned NumElts = VT.getVectorNumElements(); |
17938 | |
17939 | if (VT.is512BitVector() && InVTElt != MVT::i1 && |
17940 | (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { |
17941 | if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) |
17942 | return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); |
17943 | return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); |
17944 | } |
17945 | |
17946 | if (InVTElt != MVT::i1) |
17947 | return SDValue(); |
17948 | |
17949 | MVT ExtVT = VT; |
17950 | if (!VT.is512BitVector() && !Subtarget.hasVLX()) |
17951 | ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); |
17952 | |
17953 | SDValue V; |
17954 | if (Subtarget.hasDQI()) { |
17955 | V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); |
17956 | assert(!VT.is512BitVector() && "Unexpected vector type");
17957 | } else { |
17958 | SDValue NegOne = getOnesVector(ExtVT, DAG, dl); |
17959 | SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); |
17960 | V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero); |
17961 | if (ExtVT == VT) |
17962 | return V; |
17963 | } |
17964 | |
17965 | return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); |
17966 | } |
17967 | |
17968 | // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. |
17969 | // For sign extend this needs to handle all vector sizes and SSE4.1 and |
17970 | // non-SSE4.1 targets. For zero extend this should only handle inputs of |
17971 | // MVT::v64i8 when BWI is not supported, but AVX512 is. |
17972 | static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, |
17973 | const X86Subtarget &Subtarget, |
17974 | SelectionDAG &DAG) { |
17975 | SDValue In = Op->getOperand(0); |
17976 | MVT VT = Op->getSimpleValueType(0); |
17977 | MVT InVT = In.getSimpleValueType(); |
17978 | assert(VT.getSizeInBits() == InVT.getSizeInBits());
17979 | |
17980 | MVT SVT = VT.getVectorElementType(); |
17981 | MVT InSVT = InVT.getVectorElementType(); |
17982 | assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17983 | |
17984 | if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) |
17985 | return SDValue(); |
17986 | if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) |
17987 | return SDValue(); |
17988 | if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && |
17989 | !(VT.is256BitVector() && Subtarget.hasInt256()) && |
17990 | !(VT.is512BitVector() && Subtarget.hasAVX512())) |
17991 | return SDValue(); |
17992 | |
17993 | SDLoc dl(Op); |
17994 | |
17995 | // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17996 | // For 512-bit vectors, we need 128 bits or 256 bits.
17997 | if (VT.getSizeInBits() > 128) { |
17998 | // The input needs to have at least the same number of elements as the
17999 | // output, and be at least 128 bits wide.
18000 | int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements(); |
18001 | In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); |
18002 | } |
18003 | |
18004 | assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18005 | InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18006 | |
18007 | // SSE4.1 targets can use the pmovsx* instructions directly for 128-bit results,
18008 | // so those cases are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
18009 | // instructions still need to be handled here for 256/512-bit results.
18010 | if (Subtarget.hasInt256()) { |
18011 | assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18012 | unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? |
18013 | X86ISD::VSEXT : X86ISD::VZEXT; |
18014 | return DAG.getNode(ExtOpc, dl, VT, In); |
18015 | } |
18016 | |
18017 | // We should only get here for sign extend. |
18018 | assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18019 | "Unexpected opcode!");
18020 | |
18021 | // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. |
18022 | SDValue Curr = In; |
18023 | MVT CurrVT = InVT; |
18024 | |
18025 | // As SRAI is only available on i16/i32 types, we expand only up to i32 |
18026 | // and handle i64 separately. |
18027 | while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { |
18028 | Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); |
18029 | MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); |
18030 | CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); |
18031 | Curr = DAG.getBitcast(CurrVT, Curr); |
18032 | } |
18033 | |
18034 | SDValue SignExt = Curr; |
18035 | if (CurrVT != InVT) { |
18036 | unsigned SignExtShift = |
18037 | CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits(); |
18038 | SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, |
18039 | DAG.getConstant(SignExtShift, dl, MVT::i8)); |
18040 | } |
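// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// The unpack loop and shift above implement sign extension without pmovsx:
// UNPCKL with undef moves each narrow element into the high half of a lane
// twice as wide, and the arithmetic right shift brings it back down while
// replicating the sign bit. One lane of that, for i8 -> i16 (assumes
// arithmetic >> on signed integers, true for every x86 compiler):
#include <cstdint>
static int16_t signExtendLane(uint8_t Narrow) {
  uint16_t High = uint16_t(Narrow) << 8; // UNPCKL(undef, x): byte in high half
  return int16_t(int16_t(High) >> 8);    // VSRAI by 8: arithmetic shift down
}
// --- [End editor's note] ------------------------------------------------------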
18041 | |
18042 | if (CurrVT == VT) |
18043 | return SignExt; |
18044 | |
18045 | if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { |
18046 | SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, |
18047 | DAG.getConstant(31, dl, MVT::i8)); |
18048 | SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); |
18049 | return DAG.getBitcast(VT, Ext); |
18050 | } |
18051 | |
18052 | return SDValue(); |
18053 | } |
18054 | |
18055 | static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, |
18056 | SelectionDAG &DAG) { |
18057 | MVT VT = Op->getSimpleValueType(0); |
18058 | SDValue In = Op->getOperand(0); |
18059 | MVT InVT = In.getSimpleValueType(); |
18060 | SDLoc dl(Op); |
18061 | |
18062 | if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) |
18063 | return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); |
18064 | |
18065 | if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && |
18066 | (VT != MVT::v8i32 || InVT != MVT::v8i16) && |
18067 | (VT != MVT::v16i16 || InVT != MVT::v16i8)) |
18068 | return SDValue(); |
18069 | |
18070 | if (Subtarget.hasInt256()) |
18071 | return DAG.getNode(X86ISD::VSEXT, dl, VT, In); |
18072 | |
18073 | // Optimize vectors in AVX mode:
18074 | // sign extend v8i16 to v8i32 and
18075 | // v4i32 to v4i64.
18076 | //
18077 | // Divide the input vector into two parts;
18078 | // for v4i32 the shuffle masks will be { 0, 1, -1, -1} and {2, 3, -1, -1}.
18079 | // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
18080 | // then concat the vectors back to the original VT.
18081 | |
18082 | unsigned NumElems = InVT.getVectorNumElements(); |
18083 | SDValue Undef = DAG.getUNDEF(InVT); |
18084 | |
18085 | SmallVector<int,8> ShufMask1(NumElems, -1); |
18086 | for (unsigned i = 0; i != NumElems/2; ++i) |
18087 | ShufMask1[i] = i; |
18088 | |
18089 | SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1); |
18090 | |
18091 | SmallVector<int,8> ShufMask2(NumElems, -1); |
18092 | for (unsigned i = 0; i != NumElems/2; ++i) |
18093 | ShufMask2[i] = i + NumElems/2; |
18094 | |
18095 | SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); |
18096 | |
18097 | MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), |
18098 | VT.getVectorNumElements() / 2); |
18099 | |
18100 | OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); |
18101 | OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); |
18102 | |
18103 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); |
18104 | } |
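// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// On AVX1 the 256-bit integer sign extend above is split: shuffle out each
// 128-bit half, sign-extend each half to the wider element type, and
// concatenate. A scalar model of the v8i16 -> v8i32 case (names are ours):
#include <array>
#include <cstdint>
static std::array<int32_t, 8> sextV8i16(const std::array<int16_t, 8> &In) {
  std::array<int32_t, 8> Out{};
  for (int Half = 0; Half < 2; ++Half)      // ShufMask1 / ShufMask2 halves
    for (int I = 0; I < 4; ++I)             // vpmovsxwd on one 128-bit half
      Out[Half * 4 + I] = In[Half * 4 + I]; // int16_t -> int32_t sign extends
  return Out;                               // CONCAT_VECTORS of the two halves
}
// --- [End editor's note] ------------------------------------------------------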
18105 | |
18106 | // Lower a truncating store. We need special lowering for vXi1 vectors.
18107 | static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, |
18108 | SelectionDAG &DAG) { |
18109 | StoreSDNode *St = cast<StoreSDNode>(StOp.getNode()); |
18110 | SDLoc dl(St); |
18111 | EVT MemVT = St->getMemoryVT(); |
18112 | assert(St->isTruncatingStore() && "We only custom truncating store.");
18113 | assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18114 | "Expected truncstore of i1 vector");
18115 | |
18116 | SDValue Op = St->getValue(); |
18117 | MVT OpVT = Op.getValueType().getSimpleVT(); |
18118 | unsigned NumElts = OpVT.getVectorNumElements(); |
18119 | if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || |
18120 | NumElts == 16) { |
18121 | // Truncate and store - everything is legal |
18122 | Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op); |
18123 | if (MemVT.getSizeInBits() < 8) |
18124 | Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, |
18125 | DAG.getUNDEF(MVT::v8i1), Op, |
18126 | DAG.getIntPtrConstant(0, dl)); |
18127 | return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), |
18128 | St->getMemOperand()); |
18129 | } |
18130 | |
18131 | // A subset; assume that we have only AVX-512F.
18132 | if (NumElts <= 8) { |
18133 | if (NumElts < 8) { |
18134 | // Extend to an 8-element vector.
18135 | MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8); |
18136 | Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT, |
18137 | DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); |
18138 | } |
18139 | Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); |
18140 | return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), |
18141 | St->getMemOperand()); |
18142 | } |
18143 | // v32i8 |
18144 | assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18145 | // Divide the vector into 2 parts and store each part separately |
18146 | SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, |
18147 | DAG.getIntPtrConstant(0, dl)); |
18148 | Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo); |
18149 | SDValue BasePtr = St->getBasePtr(); |
18150 | SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr, |
18151 | St->getMemOperand()); |
18152 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, |
18153 | DAG.getIntPtrConstant(16, dl)); |
18154 | Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); |
18155 | |
18156 | SDValue BasePtrHi = |
18157 | DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, |
18158 | DAG.getConstant(2, dl, BasePtr.getValueType())); |
18159 | |
18160 | SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, |
18161 | BasePtrHi, St->getMemOperand()); |
18162 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); |
18163 | } |
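// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// The v32i8 path above packs each 16-lane half into a 16-bit mask and stores
// the second mask 2 bytes past the first (BasePtrHi = BasePtr + 2). A scalar
// model, assuming a little-endian target like x86 (names are ours):
#include <cstdint>
#include <cstring>
static void storeMask16(uint8_t *Ptr, const uint8_t *Lanes) {
  uint16_t Bits = 0;
  for (int I = 0; I < 16; ++I)
    Bits |= uint16_t(Lanes[I] & 1) << I; // TRUNCATE v16i8 -> v16i1
  std::memcpy(Ptr, &Bits, sizeof(Bits));
}
static void storeMask32(uint8_t *BasePtr, const uint8_t *Lanes) {
  storeMask16(BasePtr, Lanes);          // StLo at BasePtr
  storeMask16(BasePtr + 2, Lanes + 16); // StHi at BasePtr + 2 (BasePtrHi)
}
// --- [End editor's note] ------------------------------------------------------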
18164 | |
18165 | static SDValue LowerExtended1BitVectorLoad(SDValue Op, |
18166 | const X86Subtarget &Subtarget, |
18167 | SelectionDAG &DAG) { |
18168 | |
18169 | LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); |
18170 | SDLoc dl(Ld); |
18171 | EVT MemVT = Ld->getMemoryVT(); |
18172 | assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18173 | "Expected i1 vector load");
18174 | unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ? |
18175 | ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; |
18176 | MVT VT = Op.getValueType().getSimpleVT(); |
18177 | unsigned NumElts = VT.getVectorNumElements(); |
18178 | |
18179 | if ((Subtarget.hasBWI() && NumElts >= 32) || |
18180 | (Subtarget.hasDQI() && NumElts < 16) || |
18181 | NumElts == 16) { |
18182 | // Load and extend - everything is legal |
18183 | if (NumElts < 8) { |
18184 | SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(), |
18185 | Ld->getBasePtr(), |
18186 | Ld->getMemOperand()); |
18187 | // Replace chain users with the new chain. |
18188 | assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18189 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); |
18190 | MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); |
18191 | SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load); |
18192 | |
18193 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, |
18194 | DAG.getIntPtrConstant(0, dl)); |
18195 | } |
18196 | SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(), |
18197 | Ld->getBasePtr(), |
18198 | Ld->getMemOperand()); |
18199 | // Replace chain users with the new chain. |
18200 | assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18201 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); |
18202 | |
18203 | // Finally, do a normal sign-extend to the desired register. |
18204 | return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load); |
18205 | } |
18206 | |
18207 | if (NumElts <= 8) { |
18208 | // A subset; assume that we have only AVX-512F.
18209 | unsigned NumBitsToLoad = 8; |
18210 | MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); |
18211 | SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), |
18212 | Ld->getBasePtr(), |
18213 | Ld->getMemOperand()); |
18214 | // Replace chain users with the new chain. |
18215 | assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18216 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); |
18217 | |
18218 | MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad); |
18219 | SDValue BitVec = DAG.getBitcast(MaskVT, Load); |
18220 | |
18221 | if (NumElts == 8) |
18222 | return DAG.getNode(ExtOpcode, dl, VT, BitVec); |
18223 | |
18224 | // We need to take care of the v4i1 and v2i1 cases here.
18225 | |
18226 | MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); |
18227 | SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec); |
18228 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, |
18229 | DAG.getIntPtrConstant(0, dl)); |
18230 | } |
18231 | |
18232 | assert(VT == MVT::v32i8 && "Unexpected extload type");
18233 | |
18234 | SmallVector<SDValue, 2> Chains; |
18235 | |
18236 | SDValue BasePtr = Ld->getBasePtr(); |
18237 | SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), |
18238 | Ld->getBasePtr(), |
18239 | Ld->getMemOperand()); |
18240 | Chains.push_back(LoadLo.getValue(1)); |
18241 | |
18242 | SDValue BasePtrHi = |
18243 | DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, |
18244 | DAG.getConstant(2, dl, BasePtr.getValueType())); |
18245 | |
18246 | SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), |
18247 | BasePtrHi, |
18248 | Ld->getMemOperand()); |
18249 | Chains.push_back(LoadHi.getValue(1)); |
18250 | SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); |
18251 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); |
18252 | |
18253 | SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); |
18254 | SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi); |
18255 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi); |
18256 | } |
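// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// Loading a vXi1 mask above is the inverse of the truncating store: read the
// packed bits, then sign or zero extend each bit into a full lane. One lane
// (helper name is ours):
#include <cstdint>
static int8_t extendMaskLane(uint8_t PackedByte, int Lane, bool IsSext) {
  uint8_t Bit = (PackedByte >> Lane) & 1; // bitcast i8 -> v8i1, pick one lane
  return IsSext ? int8_t(-int8_t(Bit))    // SEXTLOAD: lane becomes 0 or -1
                : int8_t(Bit);            // ZEXTLOAD: lane becomes 0 or 1
}
// --- [End editor's note] ------------------------------------------------------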
18257 | |
18258 | // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18259 | // may emit an illegal shuffle but the expansion is still better than scalar
18260 | // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18261 | // we'll emit a shuffle and an arithmetic shift.
18262 | // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. |
18263 | // TODO: It is possible to support ZExt by zeroing the undef values during |
18264 | // the shuffle phase or after the shuffle. |
18265 | static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, |
18266 | SelectionDAG &DAG) { |
18267 | MVT RegVT = Op.getSimpleValueType(); |
18268 | assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18269 | assert(RegVT.isInteger() &&
18270 | "We only custom lower integer vector sext loads.");
18271 | |
18272 | // Nothing useful we can do without SSE2 shuffles. |
18273 | assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18274 | |
18275 | LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); |
18276 | SDLoc dl(Ld); |
18277 | EVT MemVT = Ld->getMemoryVT(); |
18278 | if (MemVT.getScalarType() == MVT::i1) |
18279 | return LowerExtended1BitVectorLoad(Op, Subtarget, DAG); |
18280 | |
18281 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
18282 | unsigned RegSz = RegVT.getSizeInBits(); |
18283 | |
18284 | ISD::LoadExtType Ext = Ld->getExtensionType(); |
18285 | |
18286 | assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18287 | && "Only anyext and sext are currently implemented.");
18288 | assert(MemVT != RegVT && "Cannot extend to the same type");
18289 | assert(MemVT.isVector() && "Must load a vector from memory");
18290 | |
18291 | unsigned NumElems = RegVT.getVectorNumElements(); |
18292 | unsigned MemSz = MemVT.getSizeInBits(); |
18293 | assert(RegSz > MemSz && "Register size must be greater than the mem size");
18294 | |
18295 | if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { |
18296 | // The only way in which we have a legal 256-bit vector result but not the |
18297 | // integer 256-bit operations needed to directly lower a sextload is if we |
18298 | // have AVX1 but not AVX2. In that case, we can always emit a sextload to |
18299 | // a 128-bit vector and a normal sign_extend to 256-bits that should get |
18300 | // correctly legalized. We do this late to allow the canonical form of |
18301 | // sextload to persist throughout the rest of the DAG combiner -- it wants |
18302 | // to fold together any extensions it can, and so will fuse a sign_extend |
18303 | // of an sextload into a sextload targeting a wider value. |
18304 | SDValue Load; |
18305 | if (MemSz == 128) { |
18306 | // Just switch this to a normal load. |
18307 | assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18308 | "it must be a legal 128-bit vector "
18309 | "type!");
18310 | Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), |
18311 | Ld->getPointerInfo(), Ld->getAlignment(), |
18312 | Ld->getMemOperand()->getFlags()); |
18313 | } else { |
18314 | assert(MemSz < 128 &&
18315 | "Can't extend a type wider than 128 bits to a 256 bit vector!");
18316 | // Do an sext load to a 128-bit vector type. We want to use the same |
18317 | // number of elements, but elements half as wide. This will end up being |
18318 | // recursively lowered by this routine, but will succeed as we definitely |
18319 | // have all the necessary features if we're using AVX1. |
18320 | EVT HalfEltVT = |
18321 | EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); |
18322 | EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); |
18323 | Load = |
18324 | DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), |
18325 | Ld->getPointerInfo(), MemVT, Ld->getAlignment(), |
18326 | Ld->getMemOperand()->getFlags()); |
18327 | } |
18328 | |
18329 | // Replace chain users with the new chain. |
18330 | assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18331 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); |
18332 | |
18333 | // Finally, do a normal sign-extend to the desired register. |
18334 | return DAG.getSExtOrTrunc(Load, dl, RegVT); |
18335 | } |
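// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// The AVX1-only path above does the extension in two legal steps: an sext
// load into a 128-bit vector using half-width elements, then an ordinary
// sign_extend up to the 256-bit result. The per-element effect (names are
// ours):
#include <cstdint>
static int64_t sextLoadOneElt(const int8_t *Mem) {
  int32_t Half = *Mem;  // sextload i8 -> i32 element (fits a 128-bit vector)
  return int64_t(Half); // sign_extend i32 -> i64 element (256-bit result)
}
// --- [End editor's note] ------------------------------------------------------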
18336 | |
18337 | // All sizes must be a power of two. |
18338 | assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18339 | "Non-power-of-two elements are not custom lowered!");
18340 | |
18341 | // Attempt to load the original value using scalar loads. |
18342 | // Find the largest scalar type that divides the total loaded size. |
18343 | MVT SclrLoadTy = MVT::i8; |
18344 | for (MVT Tp : MVT::integer_valuetypes()) { |
18345 | if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { |
18346 | SclrLoadTy = Tp; |
18347 | } |
18348 | } |
18349 | |
18350 | // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
18351 | if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && |
18352 | (64 <= MemSz)) |
18353 | SclrLoadTy = MVT::f64; |
18354 | |
18355 | // Calculate the number of scalar loads that we need to perform |
18356 | // in order to load our vector from memory. |
18357 | unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); |
18358 | |
18359 | assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18360 | "Can only lower sext loads with a single scalar load!");
18361 | |
18362 | unsigned loadRegSize = RegSz;
18363 | if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18364 | loadRegSize = 128;
18365 | |
18366 | // Represent our vector as a sequence of elements which are the |
18367 | // largest scalar that we can load. |
18368 | EVT LoadUnitVecVT = EVT::getVectorVT( |
18369 | *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
18370 | |
18371 | // Represent the data using the same element type that is stored in |
18372 | // memory. In practice, we "widen" MemVT.
18373 | EVT WideVecVT = |
18374 | EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), |
18375 | loadRegSize / MemVT.getScalarSizeInBits());
18376 | |
18377 | assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18378 | "Invalid vector type");
18379 | |
18380 | // We can't shuffle using an illegal type. |
18381 | assert(TLI.isTypeLegal(WideVecVT) &&
18382 | "We only lower types that form legal widened vector types");
18383 | |
18384 | SmallVector<SDValue, 8> Chains; |
18385 | SDValue Ptr = Ld->getBasePtr(); |
18386 | SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, |
18387 | TLI.getPointerTy(DAG.getDataLayout())); |
18388 | SDValue Res = DAG.getUNDEF(LoadUnitVecVT); |
18389 | |
18390 | for (unsigned i = 0; i < NumLoads; ++i) { |
18391 | // Perform a single load. |
18392 | SDValue ScalarLoad = |
18393 | DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), |
18394 | Ld->getAlignment(), Ld->getMemOperand()->getFlags()); |
18395 | Chains.push_back(ScalarLoad.getValue(1)); |
18396 | // Create the first element type using SCALAR_TO_VECTOR in order to avoid |
18397 | // another round of DAGCombining. |
18398 | if (i == 0) |
18399 | Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); |
18400 | else |
18401 | Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, |
18402 | ScalarLoad, DAG.getIntPtrConstant(i, dl)); |
18403 | |
18404 | Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); |
18405 | } |
18406 | |
18407 | SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); |
18408 | |
18409 | // Bitcast the loaded value to a vector of the original element type, in |
18410 | // the size of the target vector type. |
18411 | SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); |
18412 | unsigned SizeRatio = RegSz / MemSz; |
18413 | |
18414 | if (Ext == ISD::SEXTLOAD) { |
18415 | // If we have SSE4.1, we can directly emit a VSEXT node. |
18416 | if (Subtarget.hasSSE41()) { |
18417 | SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); |
18418 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); |
18419 | return Sext; |
18420 | } |
18421 | |
18422 | // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest |
18423 | // lanes. |
18424 | assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18425 | "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18426 | |
18427 | SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); |
18428 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); |
18429 | return Shuff; |
18430 | } |
18431 | |
18432 | // Redistribute the loaded elements into the different locations. |
18433 | SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); |
18434 | for (unsigned i = 0; i != NumElems; ++i) |
18435 | ShuffleVec[i * SizeRatio] = i; |
18436 | |
18437 | SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, |
18438 | DAG.getUNDEF(WideVecVT), ShuffleVec); |
18439 | |
18440 | // Bitcast to the requested type. |
18441 | Shuff = DAG.getBitcast(RegVT, Shuff); |
18442 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); |
18443 | return Shuff; |
18444 | } |
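// --- [Editor's note] Illustrative sketch, not part of X86ISelLowering.cpp. ---
// For the anyext path above, the loaded narrow elements are spread out with a
// shuffle so each lands in the low lane of a wider slot, leaving the high
// lanes undef. The mask construction, extracted (names are ours):
#include <vector>
static std::vector<int> redistributionMask(unsigned NumElems,
                                           unsigned SizeRatio) {
  std::vector<int> Mask(NumElems * SizeRatio, -1); // -1 is an undef lane
  for (unsigned I = 0; I != NumElems; ++I)
    Mask[I * SizeRatio] = int(I);
  return Mask; // e.g. NumElems=4, SizeRatio=2 -> {0,-1,1,-1,2,-1,3,-1}
}
// --- [End editor's note] ------------------------------------------------------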
18445 | |
18446 | /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC
18447 | /// nodes, each of which has no other use apart from the AND / OR.
18448 | static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { |
18449 | Opc = Op.getOpcode(); |
18450 | if (Opc != ISD::OR && Opc != ISD::AND) |
18451 | return false; |
18452 | return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && |
18453 | Op.getOperand(0).hasOneUse() && |
18454 | Op.getOperand(1).getOpcode() == X86ISD::SETCC && |
18455 | Op.getOperand(1).hasOneUse()); |
18456 | } |
18457 | |
18458 | /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and
18459 | /// the SETCC node has a single use.
18460 | static bool isXor1OfSetCC(SDValue Op) { |
18461 | if (Op.getOpcode() != ISD::XOR) |
18462 | return false; |
18463 | if (isOneConstant(Op.getOperand(1))) |
18464 | return Op.getOperand(0).getOpcode() == X86ISD::SETCC && |
18465 | Op.getOperand(0).hasOneUse(); |
18466 | return false; |
18467 | } |
18468 | |
18469 | SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { |
18470 | bool addTest = true; |
18471 | SDValue Chain = Op.getOperand(0); |
18472 | SDValue Cond = Op.getOperand(1); |
18473 | SDValue Dest = Op.getOperand(2); |
18474 | SDLoc dl(Op); |
18475 | SDValue CC; |
18476 | bool Inverted = false; |
18477 | |
18478 | if (Cond.getOpcode() == ISD::SETCC) { |
18479 | // Check for setcc([su]{add,sub,mul}o == 0). |
18480 | if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && |
18481 | isNullConstant(Cond.getOperand(1)) && |
18482 | Cond.getOperand(0).getResNo() == 1 && |
18483 | (Cond.getOperand(0).getOpcode() == ISD::SADDO || |
18484 | Cond.getOperand(0).getOpcode() == ISD::UADDO || |
18485 | Cond.getOperand(0).getOpcode() == ISD::SSUBO || |
18486 | Cond.getOperand(0).getOpcode() == ISD::USUBO || |
18487 | Cond.getOperand(0).getOpcode() == ISD::SMULO || |
18488 | Cond.getOperand(0).getOpcode() == ISD::UMULO)) { |
18489 | Inverted = true; |
18490 | Cond = Cond.getOperand(0); |
18491 | } else { |
18492 | if (SDValue NewCond = LowerSETCC(Cond, DAG)) |
18493 | Cond = NewCond; |
18494 | } |
18495 | } |
18496 | #if 0 |
18497 | // FIXME: LowerXALUO doesn't handle these!! |
18498 | else if (Cond.getOpcode() == X86ISD::ADD || |
18499 | Cond.getOpcode() == X86ISD::SUB || |
18500 | Cond.getOpcode() == X86ISD::SMUL || |
18501 | Cond.getOpcode() == X86ISD::UMUL) |
18502 | Cond = LowerXALUO(Cond, DAG); |
18503 | #endif |
18504 | |
18505 | // Look past (and (setcc_carry (cmp ...)), 1).
18506 | if (Cond.getOpcode() == ISD::AND && |
18507 | Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && |
18508 | isOneConstant(Cond.getOperand(1))) |
18509 | Cond = Cond.getOperand(0); |
18510 | |
18511 | // If condition flag is set by a X86ISD::CMP, then use it as the condition |
18512 | // setting operand in place of the X86ISD::SETCC. |
18513 | unsigned CondOpcode = Cond.getOpcode(); |
18514 | if (CondOpcode == X86ISD::SETCC || |
18515 | CondOpcode == X86ISD::SETCC_CARRY) { |
18516 | CC = Cond.getOperand(0); |
18517 | |
18518 | SDValue Cmp = Cond.getOperand(1); |
18519 | unsigned Opc = Cmp.getOpcode(); |
18520 | // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? |
18521 | if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { |
18522 | Cond = Cmp; |
18523 | addTest = false; |
18524 | } else { |
18525 | switch (cast<ConstantSDNode>(CC)->getZExtValue()) { |
18526 | default: break; |
18527 | case X86::COND_O: |
18528 | case X86::COND_B: |
18529 | // These can only come from an arithmetic instruction with overflow, |
18530 | // e.g. SADDO, UADDO. |
18531 | Cond = Cond.getOperand(1); |
18532 | addTest = false; |
18533 | break; |
18534 | } |
18535 | } |
18536 | } |
18537 | CondOpcode = Cond.getOpcode(); |
18538 | if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || |
18539 | CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || |
18540 | ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && |
18541 | Cond.getOperand(0).getValueType() != MVT::i8)) { |
18542 | SDValue LHS = Cond.getOperand(0); |
18543 | SDValue RHS = Cond.getOperand(1); |
18544 | unsigned X86Opcode; |
18545 | unsigned X86Cond; |
18546 | SDVTList VTs; |
18547 | // Keep this in sync with LowerXALUO, otherwise we might create redundant
18548 | // instructions that can't be removed afterwards (e.g. X86ISD::ADD and
18549 | // X86ISD::INC).
18550 | switch (CondOpcode) { |
18551 | case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; |
18552 | case ISD::SADDO: |
18553 | if (isOneConstant(RHS)) { |
18554 | X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; |
18555 | break; |
18556 | } |
18557 | X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; |
18558 | case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; |
18559 | case ISD::SSUBO: |
18560 | if (isOneConstant(RHS)) { |
18561 | X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; |
18562 | break; |
18563 | } |
18564 | X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; |
18565 | case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; |
18566 | case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; |
18567 | default: llvm_unreachable("unexpected overflowing operator");
18568 | } |
18569 | if (Inverted) |
18570 | X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); |
18571 | if (CondOpcode == ISD::UMULO) |
18572 | VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), |
18573 | MVT::i32); |
18574 | else |
18575 | VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); |
18576 | |
18577 | SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); |
18578 | |
18579 | if (CondOpcode == ISD::UMULO) |
18580 | Cond = X86Op.getValue(2); |
18581 | else |
18582 | Cond = X86Op.getValue(1); |
18583 | |
18584 | CC = DAG.getConstant(X86Cond, dl, MVT::i8); |
18585 | addTest = false; |
18586 | } else { |
18587 | unsigned CondOpc; |
18588 | if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { |
18589 | SDValue Cmp = Cond.getOperand(0).getOperand(1); |
18590 | if (CondOpc == ISD::OR) { |
18591 | // Also, recognize the pattern generated by an FCMP_UNE. We can emit |
18592 | // two branches instead of an explicit OR instruction with a |
18593 | // separate test. |
18594 | if (Cmp == Cond.getOperand(1).getOperand(1) && |
18595 | isX86LogicalCmp(Cmp)) { |
18596 | CC = Cond.getOperand(0).getOperand(0); |
18597 | Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), |
18598 | Chain, Dest, CC, Cmp); |
18599 | CC = Cond.getOperand(1).getOperand(0); |
18600 | Cond = Cmp; |
18601 | addTest = false; |
18602 | } |
18603 | } else { // ISD::AND |
18604 | // Also, recognize the pattern generated by an FCMP_OEQ. We can emit |
18605 | // two branches instead of an explicit AND instruction with a |
18606 | // separate test. However, we only do this if this block doesn't |
18607 | // have a fall-through edge, because this requires an explicit |
18608 | // jmp when the condition is false. |
18609 | if (Cmp == Cond.getOperand(1).getOperand(1) && |
18610 | isX86LogicalCmp(Cmp) && |
18611 | Op.getNode()->hasOneUse()) { |
18612 | X86::CondCode CCode = |
18613 | (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); |
18614 | CCode = X86::GetOppositeBranchCondition(CCode); |
18615 | CC = DAG.getConstant(CCode, dl, MVT::i8); |
18616 | SDNode *User = *Op.getNode()->use_begin(); |
18617 | // Look for an unconditional branch following this conditional branch. |
18618 | // We need one because we must reverse the successors in order
18619 | // to implement FCMP_OEQ. |
18620 | if (User->getOpcode() == ISD::BR) { |
18621 | SDValue FalseBB = User->getOperand(1); |
18622 | SDNode *NewBR = |
18623 | DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); |
18624 | assert(NewBR == User);
18625 | (void)NewBR; |
18626 | Dest = FalseBB; |
18627 | |
18628 | Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), |
18629 | Chain, Dest, CC, Cmp); |
18630 | X86::CondCode CCode = |
18631 | (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); |
18632 | CCode = X86::GetOppositeBranchCondition(CCode); |
18633 | CC = DAG.getConstant(CCode, dl, MVT::i8); |
18634 | Cond = Cmp; |
18635 | addTest = false; |
18636 | } |
18637 | } |
18638 | } |
18639 | } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { |
18640 | // Recognize the (xor (setcc), 1) pattern; the xor inverts the condition.
18641 | // It should be removed by the DAG combiner, except when the condition
18642 | // is set by an arithmetic-with-overflow node.
18643 | X86::CondCode CCode = |
18644 | (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); |
18645 | CCode = X86::GetOppositeBranchCondition(CCode); |
18646 | CC = DAG.getConstant(CCode, dl, MVT::i8); |
18647 | Cond = Cond.getOperand(0).getOperand(1); |
18648 | addTest = false; |
18649 | } else if (Cond.getOpcode() == ISD::SETCC && |
18650 | cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { |
18651 | // For FCMP_OEQ, we can emit |
18652 | // two branches instead of an explicit AND instruction with a |
18653 | // separate test. However, we only do this if this block doesn't |
18654 | // have a fall-through edge, because this requires an explicit |
18655 | // jmp when the condition is false. |
18656 | if (Op.getNode()->hasOneUse()) { |
18657 | SDNode *User = *Op.getNode()->use_begin(); |
18658 | // Look for an unconditional branch following this conditional branch. |
18659 | // We need one because we must reverse the successors in order
18660 | // to implement FCMP_OEQ. |
18661 | if (User->getOpcode() == ISD::BR) { |
18662 | SDValue FalseBB = User->getOperand(1); |
18663 | SDNode *NewBR = |
18664 | DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); |
18665 | assert(NewBR == User);
18666 | (void)NewBR; |
18667 | Dest = FalseBB; |
18668 | |
18669 | SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, |
18670 | Cond.getOperand(0), Cond.getOperand(1)); |
18671 | Cmp = ConvertCmpIfNecessary(Cmp, DAG); |
18672 | CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); |
18673 | Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), |
18674 | Chain, Dest, CC, Cmp); |
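      | // The FP compare sets PF when the operands are unordered, so OEQ is false if
      | // either ZF == 0 (the NE branch above) or PF == 1; the trailing branch emitted
      | // at the end of this function sends the P case to FalseBB as well, and the
      | // ordered-equal case falls through to the rewritten unconditional branch.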
18675 | CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); |
18676 | Cond = Cmp; |
18677 | addTest = false; |
18678 | } |
18679 | } |
18680 | } else if (Cond.getOpcode() == ISD::SETCC && |
18681 | cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { |
18682 | // For FCMP_UNE, we can emit |
18683 | // two branches instead of an explicit AND instruction with a |
18684 | // separate test. However, we only do this if this block doesn't |
18685 | // have a fall-through edge, because this requires an explicit |
18686 | // jmp when the condition is false. |
18687 | if (Op.getNode()->hasOneUse()) { |
18688 | SDNode *User = *Op.getNode()->use_begin(); |
18689 | // Look for an unconditional branch following this conditional branch. |
18690 | // We need one because we must reverse the successors in order
18691 | // to implement FCMP_UNE. |
18692 | if (User->getOpcode() == ISD::BR) { |
18693 | SDValue FalseBB = User->getOperand(1); |
18694 | SDNode *NewBR = |
18695 | DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); |
18696 | assert(NewBR == User);
18697 | (void)NewBR; |
18698 | |
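      | // UNE holds when ZF == 0 (not equal) or PF == 1 (unordered). Branch to the
      | // true successor on NE below; the trailing NP branch then routes the
      | // ordered-equal case to FalseBB, and the unordered case falls through to the
      | // rewritten unconditional branch.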
18699 | SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, |
18700 | Cond.getOperand(0), Cond.getOperand(1)); |
18701 | Cmp = ConvertCmpIfNecessary(Cmp, DAG); |
18702 | CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); |
18703 | Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), |
18704 | Chain, Dest, CC, Cmp); |
18705 | CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); |
18706 | Cond = Cmp; |
18707 | addTest = false; |
18708 | Dest = FalseBB; |
18709 | } |
18710 | } |
18711 | } |
18712 | } |
18713 | |
18714 | if (addTest) { |
18715 | // Look past the truncate if the high bits are known zero.
18716 | if (isTruncWithZeroHighBitsInput(Cond, DAG)) |
18717 | Cond = Cond.getOperand(0); |
18718 | |
18719 | // We know the result is compared against zero. Try to match it to BT. |
18720 | if (Cond.hasOneUse()) { |
18721 | if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { |
18722 | CC = NewSetCC.getOperand(0); |
18723 | Cond = NewSetCC.getOperand(1); |
18724 | addTest = false; |
18725 | } |
18726 | } |
18727 | } |
18728 | |
18729 | if (addTest) { |
18730 | X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; |
18731 | CC = DAG.getConstant(X86Cond, dl, MVT::i8); |
18732 | Cond = EmitTest(Cond, X86Cond, dl, DAG); |
18733 | } |
18734 | Cond = ConvertCmpIfNecessary(Cond, DAG); |
18735 | return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), |
18736 | Chain, Dest, CC, Cond); |
18737 | } |
18738 | |
18739 | // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. |
18740 | // Calls to _alloca are needed to probe the stack when allocating more than 4k |
18741 | // bytes in one go. Touching the stack at 4K increments is necessary to ensure |
18742 | // that the guard pages used by the OS virtual memory manager are allocated in |
18743 | // correct sequence. |
18744 | SDValue |
18745 | X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
18746 | SelectionDAG &DAG) const { |
18747 | MachineFunction &MF = DAG.getMachineFunction(); |
18748 | bool SplitStack = MF.shouldSplitStack(); |
18749 | bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); |
18750 | bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || |
18751 | SplitStack || EmitStackProbe; |
18752 | SDLoc dl(Op); |
18753 | |
18754 | // Get the inputs. |
18755 | SDNode *Node = Op.getNode(); |
18756 | SDValue Chain = Op.getOperand(0); |
18757 | SDValue Size = Op.getOperand(1); |
18758 | unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
18759 | EVT VT = Node->getValueType(0); |
18760 | |
18761 | // Chain the dynamic stack allocation so that it doesn't modify the stack |
18762 | // pointer when other instructions are using the stack. |
18763 | Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); |
18764 | |
18765 | bool Is64Bit = Subtarget.is64Bit(); |
18766 | MVT SPTy = getPointerTy(DAG.getDataLayout()); |
18767 | |
18768 | SDValue Result; |
18769 | if (!Lower) { |
18770 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
18771 | unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); |
18772 | assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18773 | " not tell us which reg is the stack pointer!");
18774 | |
18775 | SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); |
18776 | Chain = SP.getValue(1); |
18777 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); |
18778 | unsigned StackAlign = TFI.getStackAlignment(); |
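      | // The stack grows down, so the fresh allocation starts at SP - Size; for
      | // over-aligned requests the result is additionally masked down to a multiple
      | // of Align.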
18779 | Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value |
18780 | if (Align > StackAlign) |
18781 | Result = DAG.getNode(ISD::AND, dl, VT, Result, |
18782 | DAG.getConstant(-(uint64_t)Align, dl, VT)); |
18783 | Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain |
18784 | } else if (SplitStack) { |
18785 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
18786 | |
18787 | if (Is64Bit) { |
18788 | // The 64-bit implementation of segmented stacks needs to clobber both r10
18789 | // and r11. This makes it impossible to use it along with nested parameters.
18790 | const Function *F = MF.getFunction(); |
18791 | for (const auto &A : F->args()) { |
18792 | if (A.hasNestAttr()) |
18793 | report_fatal_error("Cannot use segmented stacks with functions that " |
18794 | "have nested arguments."); |
18795 | } |
18796 | } |
18797 | |
18798 | const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); |
18799 | unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); |
18800 | Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); |
18801 | Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, |
18802 | DAG.getRegister(Vreg, SPTy)); |
18803 | } else { |
18804 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); |
18805 | Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); |
18806 | MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); |
18807 | |
18808 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
18809 | unsigned SPReg = RegInfo->getStackRegister(); |
18810 | SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); |
18811 | Chain = SP.getValue(1); |
18812 | |
18813 | if (Align) { |
18814 | SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), |
18815 | DAG.getConstant(-(uint64_t)Align, dl, VT)); |
18816 | Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); |
18817 | } |
18818 | |
18819 | Result = SP; |
18820 | } |
18821 | |
18822 | Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), |
18823 | DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); |
18824 | |
18825 | SDValue Ops[2] = {Result, Chain}; |
18826 | return DAG.getMergeValues(Ops, dl); |
18827 | } |
18828 | |
18829 | SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { |
18830 | MachineFunction &MF = DAG.getMachineFunction(); |
18831 | auto PtrVT = getPointerTy(MF.getDataLayout()); |
18832 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
18833 | |
18834 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
18835 | SDLoc DL(Op); |
18836 | |
18837 | if (!Subtarget.is64Bit() || |
18838 | Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) { |
18839 | // vastart just stores the address of the VarArgsFrameIndex slot into the |
18840 | // memory location argument. |
18841 | SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); |
18842 | return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), |
18843 | MachinePointerInfo(SV)); |
18844 | } |
18845 | |
18846 | // __va_list_tag: |
18847 | // gp_offset (0 .. 6 * 8)
18848 | // fp_offset (48 .. 48 + 8 * 16)
18849 | // overflow_arg_area (points to parameters passed in memory).
18850 | // reg_save_area |
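      | //
      | // In C terms the layout is:
      | //   struct __va_list_tag { i32 gp_offset; i32 fp_offset;
      | //                          i8 *overflow_arg_area; i8 *reg_save_area; };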
18851 | SmallVector<SDValue, 8> MemOps; |
18852 | SDValue FIN = Op.getOperand(1); |
18853 | // Store gp_offset |
18854 | SDValue Store = DAG.getStore( |
18855 | Op.getOperand(0), DL, |
18856 | DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, |
18857 | MachinePointerInfo(SV)); |
18858 | MemOps.push_back(Store); |
18859 | |
18860 | // Store fp_offset |
18861 | FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); |
18862 | Store = DAG.getStore( |
18863 | Op.getOperand(0), DL, |
18864 | DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, |
18865 | MachinePointerInfo(SV, 4)); |
18866 | MemOps.push_back(Store); |
18867 | |
18868 | // Store ptr to overflow_arg_area |
18869 | FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); |
18870 | SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); |
18871 | Store = |
18872 | DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); |
18873 | MemOps.push_back(Store); |
18874 | |
18875 | // Store ptr to reg_save_area. |
18876 | FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( |
18877 | Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); |
18878 | SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); |
18879 | Store = DAG.getStore( |
18880 | Op.getOperand(0), DL, RSFIN, FIN, |
18881 | MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); |
18882 | MemOps.push_back(Store); |
18883 | return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); |
18884 | } |
18885 | |
18886 | SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
18887 | assert(Subtarget.is64Bit() &&
18888 | "LowerVAARG only handles 64-bit va_arg!");
18889 | assert(Op.getNumOperands() == 4);
18890 | |
18891 | MachineFunction &MF = DAG.getMachineFunction(); |
18892 | if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) |
18893 | // The Win64 ABI uses char* instead of a structure. |
18894 | return DAG.expandVAArg(Op.getNode()); |
18895 | |
18896 | SDValue Chain = Op.getOperand(0); |
18897 | SDValue SrcPtr = Op.getOperand(1); |
18898 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); |
18899 | unsigned Align = Op.getConstantOperandVal(3); |
18900 | SDLoc dl(Op); |
18901 | |
18902 | EVT ArgVT = Op.getNode()->getValueType(0); |
18903 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); |
18904 | uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); |
18905 | uint8_t ArgMode; |
18906 | |
18907 | // Decide which area this value should be read from. |
18908 | // TODO: Implement the AMD64 ABI in its entirety. This simple |
18909 | // selection mechanism works only for the basic types. |
18910 | if (ArgVT == MVT::f80) { |
18911 | llvm_unreachable("va_arg for f80 not yet implemented");
18912 | } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { |
18913 | ArgMode = 2; // Argument passed in XMM register. Use fp_offset. |
18914 | } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { |
18915 | ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. |
18916 | } else { |
18917 | llvm_unreachable("Unhandled argument type in LowerVAARG");
18918 | } |
18919 | |
18920 | if (ArgMode == 2) { |
18921 | // Sanity Check: Make sure using fp_offset makes sense. |
18922 | assert(!Subtarget.useSoftFloat() &&
18923 | !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18924 | Subtarget.hasSSE1());
18925 | } |
18926 | |
18927 | // Insert VAARG_64 node into the DAG |
18928 | // VAARG_64 returns two values: Variable Argument Address, Chain |
18929 | SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), |
18930 | DAG.getConstant(ArgMode, dl, MVT::i8), |
18931 | DAG.getConstant(Align, dl, MVT::i32)}; |
18932 | SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); |
18933 | SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, |
18934 | VTs, InstOps, MVT::i64, |
18935 | MachinePointerInfo(SV), |
18936 | /*Align=*/0, |
18937 | /*Volatile=*/false, |
18938 | /*ReadMem=*/true, |
18939 | /*WriteMem=*/true); |
18940 | Chain = VAARG.getValue(1); |
18941 | |
18942 | // Load the next argument and return it |
18943 | return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); |
18944 | } |
18945 | |
18946 | static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, |
18947 | SelectionDAG &DAG) { |
18948 | // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, |
18949 | // where a va_list is still an i8*. |
18950 | assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18951 | if (Subtarget.isCallingConvWin64( |
18952 | DAG.getMachineFunction().getFunction()->getCallingConv())) |
18953 | // Probably a Win64 va_copy. |
18954 | return DAG.expandVACopy(Op.getNode()); |
18955 | |
18956 | SDValue Chain = Op.getOperand(0); |
18957 | SDValue DstPtr = Op.getOperand(1); |
18958 | SDValue SrcPtr = Op.getOperand(2); |
18959 | const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); |
18960 | const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); |
18961 | SDLoc DL(Op); |
18962 | |
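      | // 24 bytes is the size of the x86-64 __va_list_tag: two i32 offsets plus two
      | // 8-byte pointers.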
18963 | return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, |
18964 | DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, |
18965 | false, false, |
18966 | MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); |
18967 | } |
18968 | |
18969 | /// Handle vector element shifts where the shift amount is a constant. |
18970 | /// Takes immediate version of shift as input. |
18971 | static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, |
18972 | SDValue SrcOp, uint64_t ShiftAmt, |
18973 | SelectionDAG &DAG) { |
18974 | MVT ElementType = VT.getVectorElementType(); |
18975 | |
18976 | // Bitcast the source vector to the output type; this is mainly necessary
18977 | // for vXi8/vXi64 shifts.
18978 | if (VT != SrcOp.getSimpleValueType()) |
18979 | SrcOp = DAG.getBitcast(VT, SrcOp); |
18980 | |
18981 | // Fold this packed shift into its first operand if ShiftAmt is 0. |
18982 | if (ShiftAmt == 0) |
18983 | return SrcOp; |
18984 | |
18985 | // Check for ShiftAmt >= element width |
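      | // SSE/AVX logical shifts by at least the element width produce zero, while
      | // arithmetic shifts saturate at width - 1, filling the element with the sign
      | // bit.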
18986 | if (ShiftAmt >= ElementType.getSizeInBits()) { |
18987 | if (Opc == X86ISD::VSRAI) |
18988 | ShiftAmt = ElementType.getSizeInBits() - 1; |
18989 | else |
18990 | return DAG.getConstant(0, dl, VT); |
18991 | } |
18992 | |
18993 | assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18994 | && "Unknown target vector shift-by-constant node");
18995 | |
18996 | // Fold this packed vector shift into a build vector if SrcOp is a |
18997 | // vector of Constants or UNDEFs. |
18998 | if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { |
18999 | SmallVector<SDValue, 8> Elts; |
19000 | unsigned NumElts = SrcOp->getNumOperands(); |
19001 | ConstantSDNode *ND; |
19002 | |
19003 | switch(Opc) { |
19004 | default: llvm_unreachable("Unknown opcode!");
19005 | case X86ISD::VSHLI: |
19006 | for (unsigned i=0; i!=NumElts; ++i) { |
19007 | SDValue CurrentOp = SrcOp->getOperand(i); |
19008 | if (CurrentOp->isUndef()) { |
19009 | Elts.push_back(CurrentOp); |
19010 | continue; |
19011 | } |
19012 | ND = cast<ConstantSDNode>(CurrentOp); |
19013 | const APInt &C = ND->getAPIntValue(); |
19014 | Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); |
19015 | } |
19016 | break; |
19017 | case X86ISD::VSRLI: |
19018 | for (unsigned i=0; i!=NumElts; ++i) { |
19019 | SDValue CurrentOp = SrcOp->getOperand(i); |
19020 | if (CurrentOp->isUndef()) { |
19021 | Elts.push_back(CurrentOp); |
19022 | continue; |
19023 | } |
19024 | ND = cast<ConstantSDNode>(CurrentOp); |
19025 | const APInt &C = ND->getAPIntValue(); |
19026 | Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); |
19027 | } |
19028 | break; |
19029 | case X86ISD::VSRAI: |
19030 | for (unsigned i=0; i!=NumElts; ++i) { |
19031 | SDValue CurrentOp = SrcOp->getOperand(i); |
19032 | if (CurrentOp->isUndef()) { |
19033 | Elts.push_back(CurrentOp); |
19034 | continue; |
19035 | } |
19036 | ND = cast<ConstantSDNode>(CurrentOp); |
19037 | const APInt &C = ND->getAPIntValue(); |
19038 | Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); |
19039 | } |
19040 | break; |
19041 | } |
19042 | |
19043 | return DAG.getBuildVector(VT, dl, Elts); |
19044 | } |
19045 | |
19046 | return DAG.getNode(Opc, dl, VT, SrcOp, |
19047 | DAG.getConstant(ShiftAmt, dl, MVT::i8)); |
19048 | } |
19049 | |
19050 | /// Handle vector element shifts where the shift amount may or may not be a |
19051 | /// constant. Takes immediate version of shift as input. |
19052 | static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, |
19053 | SDValue SrcOp, SDValue ShAmt, |
19054 | const X86Subtarget &Subtarget, |
19055 | SelectionDAG &DAG) { |
19056 | MVT SVT = ShAmt.getSimpleValueType(); |
19057 | assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19058 | |
19059 | // Catch shift-by-constant. |
19060 | if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) |
19061 | return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, |
19062 | CShAmt->getZExtValue(), DAG); |
19063 | |
19064 | // Change opcode to non-immediate version |
19065 | switch (Opc) { |
19066 | default: llvm_unreachable("Unknown target vector shift node");
19067 | case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; |
19068 | case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; |
19069 | case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; |
19070 | } |
19071 | |
19072 | // Need to build a vector containing the shift amount.
19073 | // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
19074 | // +=================+============+=======================================+
19075 | // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
19076 | // +=================+============+=======================================+
19077 | // | i64             | Yes, No    | Use ShAmt as lowest elt               |
19078 | // | i32             | Yes        | zero-extend in-reg                    |
19079 | // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
19080 | // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
19081 | // +=================+============+=======================================+
19082 | |
19083 | if (SVT == MVT::i64) |
19084 | ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); |
19085 | else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && |
19086 | ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { |
19087 | ShAmt = ShAmt.getOperand(0); |
19088 | ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); |
19089 | ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); |
19090 | } else if (Subtarget.hasSSE41() && |
19091 | ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { |
19092 | ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); |
19093 | ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); |
19094 | } else { |
19095 | SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), |
19096 | DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; |
19097 | ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); |
19098 | } |
19099 | |
19100 | // The return type has to be a 128-bit type with the same element |
19101 | // type as the input type. |
19102 | MVT EltVT = VT.getVectorElementType(); |
19103 | MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); |
19104 | |
19105 | ShAmt = DAG.getBitcast(ShVT, ShAmt); |
19106 | return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); |
19107 | } |
19108 | |
19109 | /// \brief Return Mask with the necessary casting or extending |
19110 | /// for \p Mask according to \p MaskVT when lowering masking intrinsics |
19111 | static SDValue getMaskNode(SDValue Mask, MVT MaskVT, |
19112 | const X86Subtarget &Subtarget, SelectionDAG &DAG, |
19113 | const SDLoc &dl) { |
19114 | |
19115 | if (isAllOnesConstant(Mask)) |
19116 | return DAG.getTargetConstant(1, dl, MaskVT); |
19117 | if (X86::isZeroNode(Mask)) |
19118 | return DAG.getTargetConstant(0, dl, MaskVT); |
19119 | |
19120 | if (MaskVT.bitsGT(Mask.getSimpleValueType())) { |
19121 | // Mask should be extended |
19122 | Mask = DAG.getNode(ISD::ANY_EXTEND, dl, |
19123 | MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); |
19124 | } |
19125 | |
19126 | if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { |
19127 | if (MaskVT == MVT::v64i1) { |
19128 | assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19129 | // In 32-bit mode a direct i64 bitcast is illegal; extend/split it instead.
19130 | SDValue Lo, Hi; |
19131 | Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, |
19132 | DAG.getConstant(0, dl, MVT::i32)); |
19133 | Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, |
19134 | DAG.getConstant(1, dl, MVT::i32)); |
19135 | |
19136 | Lo = DAG.getBitcast(MVT::v32i1, Lo); |
19137 | Hi = DAG.getBitcast(MVT::v32i1, Hi); |
19138 | |
19139 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); |
19140 | } else { |
19141 | // MaskVT requires fewer than 64 bits. Truncate the mask (this should
19142 | // always succeed), then bitcast.
19143 | MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); |
19144 | return DAG.getBitcast(MaskVT, |
19145 | DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); |
19146 | } |
19147 | |
19148 | } else { |
19149 | MVT BitcastVT = MVT::getVectorVT(MVT::i1, |
19150 | Mask.getSimpleValueType().getSizeInBits()); |
19151 | // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
19152 | // extracted by EXTRACT_SUBVECTOR.
19153 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, |
19154 | DAG.getBitcast(BitcastVT, Mask), |
19155 | DAG.getIntPtrConstant(0, dl)); |
19156 | } |
19157 | } |
19158 | |
19159 | /// \brief Return (and \p Op, \p Mask) for compare instructions or |
19160 | /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the |
19161 | /// necessary casting or extending for \p Mask when lowering masking intrinsics |
19162 | static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, |
19163 | SDValue PreservedSrc, |
19164 | const X86Subtarget &Subtarget, |
19165 | SelectionDAG &DAG) { |
19166 | MVT VT = Op.getSimpleValueType(); |
19167 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
19168 | unsigned OpcodeSelect = ISD::VSELECT; |
19169 | SDLoc dl(Op); |
19170 | |
19171 | if (isAllOnesConstant(Mask)) |
19172 | return Op; |
19173 | |
19174 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
19175 | |
19176 | switch (Op.getOpcode()) { |
19177 | default: break; |
19178 | case X86ISD::PCMPEQM: |
19179 | case X86ISD::PCMPGTM: |
19180 | case X86ISD::CMPM: |
19181 | case X86ISD::CMPMU: |
19182 | return DAG.getNode(ISD::AND, dl, VT, Op, VMask); |
19183 | case X86ISD::VFPCLASS: |
19184 | case X86ISD::VFPCLASSS: |
19185 | return DAG.getNode(ISD::OR, dl, VT, Op, VMask); |
19186 | case X86ISD::VTRUNC: |
19187 | case X86ISD::VTRUNCS: |
19188 | case X86ISD::VTRUNCUS: |
19189 | case X86ISD::CVTPS2PH: |
19190 | // We can't use ISD::VSELECT here because it is not always "Legal"
19191 | // for the destination type. For example, vpmovqb requires only AVX512,
19192 | // but a vselect on byte elements requires BWI.
19193 | OpcodeSelect = X86ISD::SELECT; |
19194 | break; |
19195 | } |
19196 | if (PreservedSrc.isUndef()) |
19197 | PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); |
19198 | return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); |
19199 | } |
19200 | |
19201 | /// \brief Creates an SDNode for a predicated scalar operation. |
19202 | /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). |
19203 | /// The mask comes in as MVT::i8 and is transformed to MVT::v1i1 while
19204 | /// lowering masking intrinsics.
19205 | /// The main difference between ScalarMaskingNode and VectorMaskingNode is
19206 | /// that the former uses "X86select" instead of "vselect": we simply can't
19207 | /// create a "vselect" node for a scalar instruction.
19208 | static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, |
19209 | SDValue PreservedSrc, |
19210 | const X86Subtarget &Subtarget, |
19211 | SelectionDAG &DAG) { |
19212 | |
19213 | if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) |
19214 | if (MaskConst->getZExtValue() & 0x1) |
19215 | return Op; |
19216 | |
19217 | MVT VT = Op.getSimpleValueType(); |
19218 | SDLoc dl(Op); |
19219 | |
19220 | SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); |
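      | // Only bit 0 of the i8 mask is meaningful for a scalar operation; the
      | // SCALAR_TO_VECTOR above implicitly truncates it into a v1i1 predicate.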
19221 | if (Op.getOpcode() == X86ISD::FSETCCM || |
19222 | Op.getOpcode() == X86ISD::FSETCCM_RND) |
19223 | return DAG.getNode(ISD::AND, dl, VT, Op, IMask); |
19224 | if (Op.getOpcode() == X86ISD::VFPCLASSS) |
19225 | return DAG.getNode(ISD::OR, dl, VT, Op, IMask); |
19226 | |
19227 | if (PreservedSrc.isUndef()) |
19228 | PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); |
19229 | return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); |
19230 | } |
19231 | |
19232 | static int getSEHRegistrationNodeSize(const Function *Fn) { |
19233 | if (!Fn->hasPersonalityFn()) |
19234 | report_fatal_error( |
19235 | "querying registration node size for function without personality"); |
19236 | // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See |
19237 | // WinEHStatePass for the full struct definition. |
19238 | switch (classifyEHPersonality(Fn->getPersonalityFn())) { |
19239 | case EHPersonality::MSVC_X86SEH: return 24; |
19240 | case EHPersonality::MSVC_CXX: return 16; |
19241 | default: break; |
19242 | } |
19243 | report_fatal_error( |
19244 | "can only recover FP for 32-bit MSVC EH personality functions"); |
19245 | } |
19246 | |
19247 | /// When the MSVC runtime transfers control to us, either to an outlined |
19248 | /// function or when returning to a parent frame after catching an exception, we |
19249 | /// recover the parent frame pointer by doing arithmetic on the incoming EBP. |
19250 | /// Here's the math: |
19251 | /// RegNodeBase = EntryEBP - RegNodeSize |
19252 | /// ParentFP = RegNodeBase - ParentFrameOffset |
19253 | /// Subtracting RegNodeSize takes us to the offset of the registration node, and |
19254 | /// subtracting the offset (negative on x86) takes us back to the parent FP. |
19255 | static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, |
19256 | SDValue EntryEBP) { |
19257 | MachineFunction &MF = DAG.getMachineFunction(); |
19258 | SDLoc dl; |
19259 | |
19260 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
19261 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); |
19262 | |
19263 | // It's possible that the parent function no longer has a personality function |
19264 | // if the exceptional code was optimized away, in which case we just return |
19265 | // the incoming EBP. |
19266 | if (!Fn->hasPersonalityFn()) |
19267 | return EntryEBP; |
19268 | |
19269 | // Get an MCSymbol that will ultimately resolve to the frame offset of the EH |
19270 | // registration, or the .set_setframe offset. |
19271 | MCSymbol *OffsetSym = |
19272 | MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( |
19273 | GlobalValue::dropLLVMManglingEscape(Fn->getName())); |
19274 | SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); |
19275 | SDValue ParentFrameOffset = |
19276 | DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); |
19277 | |
19278 | // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after |
19279 | // prologue to RBP in the parent function. |
19280 | const X86Subtarget &Subtarget = |
19281 | static_cast<const X86Subtarget &>(DAG.getSubtarget()); |
19282 | if (Subtarget.is64Bit()) |
19283 | return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); |
19284 | |
19285 | int RegNodeSize = getSEHRegistrationNodeSize(Fn); |
19286 | // RegNodeBase = EntryEBP - RegNodeSize |
19287 | // ParentFP = RegNodeBase - ParentFrameOffset |
19288 | SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, |
19289 | DAG.getConstant(RegNodeSize, dl, PtrVT)); |
19290 | return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); |
19291 | } |
19292 | |
19293 | static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget, |
19294 | SelectionDAG &DAG) { |
19295 | // Helper to detect if the operand is the CUR_DIRECTION rounding mode.
19296 | auto isRoundModeCurDirection = [](SDValue Rnd) { |
19297 | if (!isa<ConstantSDNode>(Rnd)) |
19298 | return false; |
19299 | |
19300 | unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); |
19301 | return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; |
19302 | }; |
19303 | |
19304 | SDLoc dl(Op); |
19305 | unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
19306 | MVT VT = Op.getSimpleValueType(); |
19307 | const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); |
19308 | if (IntrData) { |
19309 | switch(IntrData->Type) { |
19310 | case INTR_TYPE_1OP: |
19311 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); |
19312 | case INTR_TYPE_2OP: |
19313 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), |
19314 | Op.getOperand(2)); |
19315 | case INTR_TYPE_3OP: |
19316 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), |
19317 | Op.getOperand(2), Op.getOperand(3)); |
19318 | case INTR_TYPE_4OP: |
19319 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), |
19320 | Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); |
19321 | case INTR_TYPE_1OP_MASK_RM: { |
19322 | SDValue Src = Op.getOperand(1); |
19323 | SDValue PassThru = Op.getOperand(2); |
19324 | SDValue Mask = Op.getOperand(3); |
19325 | SDValue RoundingMode; |
19326 | // We always add a rounding mode to the node.
19327 | // If the rounding mode is not specified, we add the |
19328 | // "current direction" mode. |
19329 | if (Op.getNumOperands() == 4) |
19330 | RoundingMode = |
19331 | DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); |
19332 | else |
19333 | RoundingMode = Op.getOperand(4); |
19334 | assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19335 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, |
19336 | RoundingMode), |
19337 | Mask, PassThru, Subtarget, DAG); |
19338 | } |
19339 | case INTR_TYPE_1OP_MASK: { |
19340 | SDValue Src = Op.getOperand(1); |
19341 | SDValue PassThru = Op.getOperand(2); |
19342 | SDValue Mask = Op.getOperand(3); |
19343 | // We add a rounding mode to the node when
19344 | // - RM Opcode is specified and |
19345 | // - RM is not "current direction". |
19346 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19347 | if (IntrWithRoundingModeOpcode != 0) { |
19348 | SDValue Rnd = Op.getOperand(4); |
19349 | if (!isRoundModeCurDirection(Rnd)) { |
19350 | return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19351 | dl, Op.getValueType(), |
19352 | Src, Rnd), |
19353 | Mask, PassThru, Subtarget, DAG); |
19354 | } |
19355 | } |
19356 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), |
19357 | Mask, PassThru, Subtarget, DAG); |
19358 | } |
19359 | case INTR_TYPE_SCALAR_MASK: { |
19360 | SDValue Src1 = Op.getOperand(1); |
19361 | SDValue Src2 = Op.getOperand(2); |
19362 | SDValue passThru = Op.getOperand(3); |
19363 | SDValue Mask = Op.getOperand(4); |
19364 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19365 | if (IntrWithRoundingModeOpcode != 0) { |
19366 | SDValue Rnd = Op.getOperand(5); |
19367 | if (!isRoundModeCurDirection(Rnd)) |
19368 | return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19369 | dl, VT, Src1, Src2, Rnd), |
19370 | Mask, passThru, Subtarget, DAG); |
19371 | } |
19372 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), |
19373 | Mask, passThru, Subtarget, DAG); |
19374 | } |
19375 | case INTR_TYPE_SCALAR_MASK_RM: { |
19376 | SDValue Src1 = Op.getOperand(1); |
19377 | SDValue Src2 = Op.getOperand(2); |
19378 | SDValue Src0 = Op.getOperand(3); |
19379 | SDValue Mask = Op.getOperand(4); |
19380 | // There are 2 kinds of intrinsics in this group: |
19381 | // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
19382 | // (2) With rounding mode and sae - 7 operands. |
19383 | if (Op.getNumOperands() == 6) { |
19384 | SDValue Sae = Op.getOperand(5); |
19385 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, |
19386 | Sae), |
19387 | Mask, Src0, Subtarget, DAG); |
19388 | } |
19389 | assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19390 | SDValue RoundingMode = Op.getOperand(5); |
19391 | SDValue Sae = Op.getOperand(6); |
19392 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, |
19393 | RoundingMode, Sae), |
19394 | Mask, Src0, Subtarget, DAG); |
19395 | } |
19396 | case INTR_TYPE_2OP_MASK: |
19397 | case INTR_TYPE_2OP_IMM8_MASK: { |
19398 | SDValue Src1 = Op.getOperand(1); |
19399 | SDValue Src2 = Op.getOperand(2); |
19400 | SDValue PassThru = Op.getOperand(3); |
19401 | SDValue Mask = Op.getOperand(4); |
19402 | |
19403 | if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) |
19404 | Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); |
19405 | |
19406 | // We specify 2 possible opcodes for intrinsics with rounding modes. |
19407 | // First, we check if the intrinsic may have non-default rounding mode, |
19408 | // (IntrData->Opc1 != 0), then we check the rounding mode operand. |
19409 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19410 | if (IntrWithRoundingModeOpcode != 0) { |
19411 | SDValue Rnd = Op.getOperand(5); |
19412 | if (!isRoundModeCurDirection(Rnd)) { |
19413 | return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19414 | dl, Op.getValueType(), |
19415 | Src1, Src2, Rnd), |
19416 | Mask, PassThru, Subtarget, DAG); |
19417 | } |
19418 | } |
19419 | // TODO: Intrinsics should have fast-math-flags to propagate. |
19420 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19421 | Mask, PassThru, Subtarget, DAG); |
19422 | } |
19423 | case INTR_TYPE_2OP_MASK_RM: { |
19424 | SDValue Src1 = Op.getOperand(1); |
19425 | SDValue Src2 = Op.getOperand(2); |
19426 | SDValue PassThru = Op.getOperand(3); |
19427 | SDValue Mask = Op.getOperand(4); |
19428 | // We specify 2 possible modes for intrinsics, with/without rounding |
19429 | // modes. |
19430 | // First, we check if the intrinsic has a rounding mode (6 operands);
19431 | // if not, we set the rounding mode to "current".
19432 | SDValue Rnd; |
19433 | if (Op.getNumOperands() == 6) |
19434 | Rnd = Op.getOperand(5); |
19435 | else |
19436 | Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); |
19437 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19438 | Src1, Src2, Rnd), |
19439 | Mask, PassThru, Subtarget, DAG); |
19440 | } |
19441 | case INTR_TYPE_3OP_SCALAR_MASK_RM: { |
19442 | SDValue Src1 = Op.getOperand(1); |
19443 | SDValue Src2 = Op.getOperand(2); |
19444 | SDValue Src3 = Op.getOperand(3); |
19445 | SDValue PassThru = Op.getOperand(4); |
19446 | SDValue Mask = Op.getOperand(5); |
19447 | SDValue Sae = Op.getOperand(6); |
19448 | |
19449 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, |
19450 | Src2, Src3, Sae), |
19451 | Mask, PassThru, Subtarget, DAG); |
19452 | } |
19453 | case INTR_TYPE_3OP_MASK_RM: { |
19454 | SDValue Src1 = Op.getOperand(1); |
19455 | SDValue Src2 = Op.getOperand(2); |
19456 | SDValue Imm = Op.getOperand(3); |
19457 | SDValue PassThru = Op.getOperand(4); |
19458 | SDValue Mask = Op.getOperand(5); |
19459 | // We specify 2 possible modes for intrinsics, with/without rounding |
19460 | // modes. |
19461 | // First, we check if the intrinsic has a rounding mode (7 operands);
19462 | // if not, we set the rounding mode to "current".
19463 | SDValue Rnd; |
19464 | if (Op.getNumOperands() == 7) |
19465 | Rnd = Op.getOperand(6); |
19466 | else |
19467 | Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); |
19468 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19469 | Src1, Src2, Imm, Rnd), |
19470 | Mask, PassThru, Subtarget, DAG); |
19471 | } |
19472 | case INTR_TYPE_3OP_IMM8_MASK: |
19473 | case INTR_TYPE_3OP_MASK: { |
19474 | SDValue Src1 = Op.getOperand(1); |
19475 | SDValue Src2 = Op.getOperand(2); |
19476 | SDValue Src3 = Op.getOperand(3); |
19477 | SDValue PassThru = Op.getOperand(4); |
19478 | SDValue Mask = Op.getOperand(5); |
19479 | |
19480 | if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) |
19481 | Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); |
19482 | |
19483 | // We specify 2 possible opcodes for intrinsics with rounding modes. |
19484 | // First, we check if the intrinsic may have non-default rounding mode, |
19485 | // (IntrData->Opc1 != 0), then we check the rounding mode operand. |
19486 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19487 | if (IntrWithRoundingModeOpcode != 0) { |
19488 | SDValue Rnd = Op.getOperand(6); |
19489 | if (!isRoundModeCurDirection(Rnd)) { |
19490 | return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19491 | dl, Op.getValueType(), |
19492 | Src1, Src2, Src3, Rnd), |
19493 | Mask, PassThru, Subtarget, DAG); |
19494 | } |
19495 | } |
19496 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19497 | Src1, Src2, Src3), |
19498 | Mask, PassThru, Subtarget, DAG); |
19499 | } |
19500 | case VPERM_2OP_MASK: {
19501 | SDValue Src1 = Op.getOperand(1); |
19502 | SDValue Src2 = Op.getOperand(2); |
19503 | SDValue PassThru = Op.getOperand(3); |
19504 | SDValue Mask = Op.getOperand(4); |
19505 | |
19506 | // Swap Src1 and Src2 in the node creation |
19507 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
19508 | Mask, PassThru, Subtarget, DAG); |
19509 | } |
19510 | case VPERM_3OP_MASKZ: |
19511 | case VPERM_3OP_MASK: {
19512 | MVT VT = Op.getSimpleValueType(); |
19513 | // Src2 is the PassThru |
19514 | SDValue Src1 = Op.getOperand(1); |
19515 | // PassThru needs to be the same type as the destination in order |
19516 | // to pattern match correctly. |
19517 | SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2)); |
19518 | SDValue Src3 = Op.getOperand(3); |
19519 | SDValue Mask = Op.getOperand(4); |
19520 | SDValue PassThru = SDValue(); |
19521 | |
19522 | // Set PassThru element.
19523 | if (IntrData->Type == VPERM_3OP_MASKZ) |
19524 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); |
19525 | else |
19526 | PassThru = Src2; |
19527 | |
19528 | // Swap Src1 and Src2 in the node creation |
19529 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, |
19530 | dl, Op.getValueType(), |
19531 | Src2, Src1, Src3), |
19532 | Mask, PassThru, Subtarget, DAG); |
19533 | } |
19534 | case FMA_OP_MASK3: |
19535 | case FMA_OP_MASKZ: |
19536 | case FMA_OP_MASK: { |
19537 | SDValue Src1 = Op.getOperand(1); |
19538 | SDValue Src2 = Op.getOperand(2); |
19539 | SDValue Src3 = Op.getOperand(3); |
19540 | SDValue Mask = Op.getOperand(4); |
19541 | MVT VT = Op.getSimpleValueType(); |
19542 | SDValue PassThru = SDValue(); |
19543 | |
19544 | // Set PassThru element.
19545 | if (IntrData->Type == FMA_OP_MASKZ) |
19546 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); |
19547 | else if (IntrData->Type == FMA_OP_MASK3) |
19548 | PassThru = Src3; |
19549 | else |
19550 | PassThru = Src1; |
19551 | |
19552 | // We specify 2 possible opcodes for intrinsics with rounding modes. |
19553 | // First, we check if the intrinsic may have non-default rounding mode, |
19554 | // (IntrData->Opc1 != 0), then we check the rounding mode operand. |
19555 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19556 | if (IntrWithRoundingModeOpcode != 0) { |
19557 | SDValue Rnd = Op.getOperand(5); |
19558 | if (!isRoundModeCurDirection(Rnd)) |
19559 | return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19560 | dl, Op.getValueType(), |
19561 | Src1, Src2, Src3, Rnd), |
19562 | Mask, PassThru, Subtarget, DAG); |
19563 | } |
19564 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, |
19565 | dl, Op.getValueType(), |
19566 | Src1, Src2, Src3), |
19567 | Mask, PassThru, Subtarget, DAG); |
19568 | } |
19569 | case FMA_OP_SCALAR_MASK: |
19570 | case FMA_OP_SCALAR_MASK3: |
19571 | case FMA_OP_SCALAR_MASKZ: { |
19572 | SDValue Src1 = Op.getOperand(1); |
19573 | SDValue Src2 = Op.getOperand(2); |
19574 | SDValue Src3 = Op.getOperand(3); |
19575 | SDValue Mask = Op.getOperand(4); |
19576 | MVT VT = Op.getSimpleValueType(); |
19577 | SDValue PassThru = SDValue(); |
19578 | |
19579 | // Set PassThru element.
19580 | if (IntrData->Type == FMA_OP_SCALAR_MASKZ) |
19581 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); |
19582 | else if (IntrData->Type == FMA_OP_SCALAR_MASK3) |
19583 | PassThru = Src3; |
19584 | else |
19585 | PassThru = Src1; |
19586 | |
19587 | SDValue Rnd = Op.getOperand(5); |
19588 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, |
19589 | Op.getValueType(), Src1, Src2, |
19590 | Src3, Rnd), |
19591 | Mask, PassThru, Subtarget, DAG); |
19592 | } |
19593 | case TERLOG_OP_MASK: |
19594 | case TERLOG_OP_MASKZ: { |
19595 | SDValue Src1 = Op.getOperand(1); |
19596 | SDValue Src2 = Op.getOperand(2); |
19597 | SDValue Src3 = Op.getOperand(3); |
19598 | SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); |
19599 | SDValue Mask = Op.getOperand(5); |
19600 | MVT VT = Op.getSimpleValueType(); |
19601 | SDValue PassThru = Src1; |
19602 | // Set PassThru element. |
19603 | if (IntrData->Type == TERLOG_OP_MASKZ) |
19604 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); |
19605 | |
19606 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19607 | Src1, Src2, Src3, Src4), |
19608 | Mask, PassThru, Subtarget, DAG); |
19609 | } |
19610 | case CVTPD2PS: |
19611 | // ISD::FP_ROUND has a second argument that indicates if the truncation |
19612 | // does not change the value. Set it to 0 since it can change. |
19613 | return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), |
19614 | DAG.getIntPtrConstant(0, dl)); |
19615 | case CVTPD2PS_MASK: { |
19616 | SDValue Src = Op.getOperand(1); |
19617 | SDValue PassThru = Op.getOperand(2); |
19618 | SDValue Mask = Op.getOperand(3); |
19619 | // We add a rounding mode to the node when
19620 | // - RM Opcode is specified and |
19621 | // - RM is not "current direction". |
19622 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; |
19623 | if (IntrWithRoundingModeOpcode != 0) { |
19624 | SDValue Rnd = Op.getOperand(4); |
19625 | if (!isRoundModeCurDirection(Rnd)) { |
19626 | return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, |
19627 | dl, Op.getValueType(), |
19628 | Src, Rnd), |
19629 | Mask, PassThru, Subtarget, DAG); |
19630 | } |
19631 | } |
19632 | assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19633 | // ISD::FP_ROUND has a second argument that indicates if the truncation |
19634 | // does not change the value. Set it to 0 since it can change. |
19635 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, |
19636 | DAG.getIntPtrConstant(0, dl)), |
19637 | Mask, PassThru, Subtarget, DAG); |
19638 | } |
19639 | case FPCLASS: { |
19640 | // FPclass intrinsics with mask |
19641 | SDValue Src1 = Op.getOperand(1); |
19642 | MVT VT = Src1.getSimpleValueType(); |
19643 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
19644 | SDValue Imm = Op.getOperand(2); |
19645 | SDValue Mask = Op.getOperand(3); |
19646 | MVT BitcastVT = MVT::getVectorVT(MVT::i1, |
19647 | Mask.getSimpleValueType().getSizeInBits()); |
19648 | SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); |
19649 | SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, |
19650 | DAG.getTargetConstant(0, dl, MaskVT), |
19651 | Subtarget, DAG); |
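      | // The compare yields one i1 per element; widen it into an undef vector as
      | // wide as the mask register and bitcast to get the scalar integer result the
      | // intrinsic returns.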
19652 | SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, |
19653 | DAG.getUNDEF(BitcastVT), FPclassMask, |
19654 | DAG.getIntPtrConstant(0, dl)); |
19655 | return DAG.getBitcast(Op.getValueType(), Res); |
19656 | } |
19657 | case FPCLASSS: { |
19658 | SDValue Src1 = Op.getOperand(1); |
19659 | SDValue Imm = Op.getOperand(2); |
19660 | SDValue Mask = Op.getOperand(3); |
19661 | SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); |
19662 | SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, |
19663 | DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); |
19664 | return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, |
19665 | DAG.getIntPtrConstant(0, dl)); |
19666 | } |
19667 | case CMP_MASK: |
19668 | case CMP_MASK_CC: { |
19669 | // Comparison intrinsics with masks. |
19670 | // Example of transformation: |
19671 | // (i8 (int_x86_avx512_mask_pcmpeq_q_128 |
19672 | // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> |
19673 | // (i8 (bitcast |
19674 | // (v8i1 (insert_subvector undef, |
19675 | // (v2i1 (and (PCMPEQM %a, %b), |
19676 | // (extract_subvector |
19677 | // (v8i1 (bitcast %mask)), 0))), 0)))) |
19678 | MVT VT = Op.getOperand(1).getSimpleValueType(); |
19679 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
19680 | SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); |
19681 | MVT BitcastVT = MVT::getVectorVT(MVT::i1, |
19682 | Mask.getSimpleValueType().getSizeInBits()); |
19683 | SDValue Cmp; |
19684 | if (IntrData->Type == CMP_MASK_CC) { |
19685 | SDValue CC = Op.getOperand(3); |
19686 | CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); |
19687 | // We specify 2 possible opcodes for intrinsics with rounding modes. |
19688 | // First, we check if the intrinsic may have non-default rounding mode, |
19689 | // (IntrData->Opc1 != 0), then we check the rounding mode operand. |
19690 | if (IntrData->Opc1 != 0) { |
19691 | SDValue Rnd = Op.getOperand(5); |
19692 | if (!isRoundModeCurDirection(Rnd)) |
19693 | Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), |
19694 | Op.getOperand(2), CC, Rnd); |
19695 | } |
19696 | // Default rounding mode.
19697 | if (!Cmp.getNode())
19698 | Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), |
19699 | Op.getOperand(2), CC); |
19700 | |
19701 | } else { |
19702 | assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19703 | Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), |
19704 | Op.getOperand(2)); |
19705 | } |
19706 | SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, |
19707 | DAG.getTargetConstant(0, dl, |
19708 | MaskVT), |
19709 | Subtarget, DAG); |
19710 | SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, |
19711 | DAG.getUNDEF(BitcastVT), CmpMask, |
19712 | DAG.getIntPtrConstant(0, dl)); |
19713 | return DAG.getBitcast(Op.getValueType(), Res); |
19714 | } |
19715 | case CMP_MASK_SCALAR_CC: { |
19716 | SDValue Src1 = Op.getOperand(1); |
19717 | SDValue Src2 = Op.getOperand(2); |
19718 | SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); |
19719 | SDValue Mask = Op.getOperand(4); |
19720 | |
19721 | SDValue Cmp; |
19722 | if (IntrData->Opc1 != 0) { |
19723 | SDValue Rnd = Op.getOperand(5); |
19724 | if (!isRoundModeCurDirection(Rnd)) |
19725 | Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); |
19726 | } |
19727 | // Default rounding mode. |
19728 | if (!Cmp.getNode()) |
19729 | Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); |
19730 | |
19731 | SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, |
19732 | DAG.getTargetConstant(0, dl, |
19733 | MVT::i1), |
19734 | Subtarget, DAG); |
19735 | return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, |
19736 | DAG.getIntPtrConstant(0, dl)); |
19737 | } |
19738 | case COMI: { // Comparison intrinsics |
19739 | ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; |
19740 | SDValue LHS = Op.getOperand(1); |
19741 | SDValue RHS = Op.getOperand(2); |
19742 | SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); |
19743 | SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); |
19744 | SDValue SetCC; |
19745 | switch (CC) { |
19746 | case ISD::SETEQ: { // (ZF = 0 and PF = 0) |
19747 | SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); |
19748 | SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); |
19749 | SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); |
19750 | break; |
19751 | } |
19752 | case ISD::SETNE: { // (ZF = 1 or PF = 1) |
19753 | SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); |
19754 | SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); |
19755 | SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); |
19756 | break; |
19757 | } |
19758 | case ISD::SETGT: // (CF = 0 and ZF = 0) |
19759 | SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); |
19760 | break; |
19761 | case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. |
19762 | SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); |
19763 | break; |
19764 | } |
19765 | case ISD::SETGE: // CF = 0 |
19766 | SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); |
19767 | break; |
19768 | case ISD::SETLE: // The condition is opposite to GE. Swap the operands. |
19769 | SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); |
19770 | break; |
19771 | default: |
19772 | llvm_unreachable("Unexpected illegal condition!"); |
19773 | } |
19774 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); |
19775 | } |
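      | // Editorial note (not in the original source): the parity checks above |
      | // follow (U)COMISS/(U)COMISD flag semantics, where an unordered compare |
      | // sets ZF = PF = CF = 1. "Equal" must therefore also require PF = 0 (NP), |
      | // "not equal" must accept PF = 1, and GT/GE use the unsigned conditions |
      | // A/AE since CF = 1 covers both "less than" and "unordered". |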
19776 | case COMI_RM: { // Comparison intrinsics with Sae |
19777 | SDValue LHS = Op.getOperand(1); |
19778 | SDValue RHS = Op.getOperand(2); |
19779 | unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); |
19780 | SDValue Sae = Op.getOperand(4); |
19781 | |
19782 | SDValue FCmp; |
19783 | if (isRoundModeCurDirection(Sae)) |
19784 | FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, |
19785 | DAG.getConstant(CondVal, dl, MVT::i8)); |
19786 | else |
19787 | FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, |
19788 | DAG.getConstant(CondVal, dl, MVT::i8), Sae); |
19789 | return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, |
19790 | DAG.getIntPtrConstant(0, dl)); |
19791 | } |
19792 | case VSHIFT: |
19793 | return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), |
19794 | Op.getOperand(1), Op.getOperand(2), Subtarget, |
19795 | DAG); |
19796 | case COMPRESS_EXPAND_IN_REG: { |
19797 | SDValue Mask = Op.getOperand(3); |
19798 | SDValue DataToCompress = Op.getOperand(1); |
19799 | SDValue PassThru = Op.getOperand(2); |
19800 | if (isAllOnesConstant(Mask)) // return data as is |
19801 | return Op.getOperand(1); |
19802 | |
19803 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19804 | DataToCompress), |
19805 | Mask, PassThru, Subtarget, DAG); |
19806 | } |
19807 | case BROADCASTM: { |
19808 | SDValue Mask = Op.getOperand(1); |
19809 | MVT MaskVT = MVT::getVectorVT(MVT::i1, |
19810 | Mask.getSimpleValueType().getSizeInBits()); |
19811 | Mask = DAG.getBitcast(MaskVT, Mask); |
19812 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); |
19813 | } |
19814 | case KUNPCK: { |
19815 | MVT VT = Op.getSimpleValueType(); |
19816 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); |
19817 | |
19818 | SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); |
19819 | SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); |
19820 | // Arguments should be swapped. |
19821 | SDValue Res = DAG.getNode(IntrData->Opc0, dl, |
19822 | MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), |
19823 | Src2, Src1); |
19824 | return DAG.getBitcast(VT, Res); |
19825 | } |
19826 | case MASK_BINOP: { |
19827 | MVT VT = Op.getSimpleValueType(); |
19828 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); |
19829 | |
19830 | SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); |
19831 | SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); |
19832 | SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); |
19833 | return DAG.getBitcast(VT, Res); |
19834 | } |
19835 | case FIXUPIMMS: |
19836 | case FIXUPIMMS_MASKZ: |
19837 | case FIXUPIMM: |
19838 | case FIXUPIMM_MASKZ:{ |
19839 | SDValue Src1 = Op.getOperand(1); |
19840 | SDValue Src2 = Op.getOperand(2); |
19841 | SDValue Src3 = Op.getOperand(3); |
19842 | SDValue Imm = Op.getOperand(4); |
19843 | SDValue Mask = Op.getOperand(5); |
19844 | SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ? |
19845 | Src1 : getZeroVector(VT, Subtarget, DAG, dl); |
19846 | // We specify 2 possible modes for intrinsics, with/without rounding |
19847 | // modes. |
19848 | // First, we check if the intrinsic has a rounding mode (7 operands); |
19849 | // if not, we set the rounding mode to "current". |
19850 | SDValue Rnd; |
19851 | if (Op.getNumOperands() == 7) |
19852 | Rnd = Op.getOperand(6); |
19853 | else |
19854 | Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); |
19855 | if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) |
19856 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19857 | Src1, Src2, Src3, Imm, Rnd), |
19858 | Mask, Passthru, Subtarget, DAG); |
19859 | else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ |
19860 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19861 | Src1, Src2, Src3, Imm, Rnd), |
19862 | Mask, Passthru, Subtarget, DAG); |
19863 | } |
19864 | case CONVERT_TO_MASK: { |
19865 | MVT SrcVT = Op.getOperand(1).getSimpleValueType(); |
19866 | MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); |
19867 | MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); |
19868 | |
19869 | SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, |
19870 | Op.getOperand(1)); |
19871 | SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, |
19872 | DAG.getUNDEF(BitcastVT), CvtMask, |
19873 | DAG.getIntPtrConstant(0, dl)); |
19874 | return DAG.getBitcast(Op.getValueType(), Res); |
19875 | } |
19876 | case BRCST_SUBVEC_TO_VEC: { |
19877 | SDValue Src = Op.getOperand(1); |
19878 | SDValue Passthru = Op.getOperand(2); |
19879 | SDValue Mask = Op.getOperand(3); |
19880 | EVT resVT = Passthru.getValueType(); |
19881 | SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, |
19882 | DAG.getUNDEF(resVT), Src, |
19883 | DAG.getIntPtrConstant(0, dl)); |
19884 | SDValue immVal; |
19885 | if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) |
19886 | immVal = DAG.getConstant(0x44, dl, MVT::i8); |
19887 | else |
19888 | immVal = DAG.getConstant(0, dl, MVT::i8); |
19889 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, |
19890 | subVec, subVec, immVal), |
19891 | Mask, Passthru, Subtarget, DAG); |
19892 | } |
19893 | case BRCST32x2_TO_VEC: { |
19894 | SDValue Src = Op.getOperand(1); |
19895 | SDValue PassThru = Op.getOperand(2); |
19896 | SDValue Mask = Op.getOperand(3); |
19897 | |
19898 | assert((VT.getScalarType() == MVT::i32 || |
19899 | VT.getScalarType() == MVT::f32) && "Unexpected type!"); |
19900 | // Bitcast Src to packed 64-bit elements. |
19901 | MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64; |
19902 | MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64); |
19903 | Src = DAG.getBitcast(BitcastVT, Src); |
19904 | |
19905 | return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), |
19906 | Mask, PassThru, Subtarget, DAG); |
19907 | } |
19908 | default: |
19909 | break; |
19910 | } |
19911 | } |
19912 | |
19913 | switch (IntNo) { |
19914 | default: return SDValue(); // Don't custom lower most intrinsics. |
19915 | |
19916 | case Intrinsic::x86_avx2_permd: |
19917 | case Intrinsic::x86_avx2_permps: |
19918 | // Operands intentionally swapped. Mask is last operand to intrinsic, |
19919 | // but second operand for node/instruction. |
19920 | return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), |
19921 | Op.getOperand(2), Op.getOperand(1)); |
19922 | |
19923 | // ptest and testp intrinsics. The intrinsics these come from are designed |
19924 | // to return an integer value, not just an instruction, so lower them to the |
19925 | // ptest or testp pattern and a setcc for the result. |
19926 | case Intrinsic::x86_sse41_ptestz: |
19927 | case Intrinsic::x86_sse41_ptestc: |
19928 | case Intrinsic::x86_sse41_ptestnzc: |
19929 | case Intrinsic::x86_avx_ptestz_256: |
19930 | case Intrinsic::x86_avx_ptestc_256: |
19931 | case Intrinsic::x86_avx_ptestnzc_256: |
19932 | case Intrinsic::x86_avx_vtestz_ps: |
19933 | case Intrinsic::x86_avx_vtestc_ps: |
19934 | case Intrinsic::x86_avx_vtestnzc_ps: |
19935 | case Intrinsic::x86_avx_vtestz_pd: |
19936 | case Intrinsic::x86_avx_vtestc_pd: |
19937 | case Intrinsic::x86_avx_vtestnzc_pd: |
19938 | case Intrinsic::x86_avx_vtestz_ps_256: |
19939 | case Intrinsic::x86_avx_vtestc_ps_256: |
19940 | case Intrinsic::x86_avx_vtestnzc_ps_256: |
19941 | case Intrinsic::x86_avx_vtestz_pd_256: |
19942 | case Intrinsic::x86_avx_vtestc_pd_256: |
19943 | case Intrinsic::x86_avx_vtestnzc_pd_256: { |
19944 | bool IsTestPacked = false; |
19945 | X86::CondCode X86CC; |
19946 | switch (IntNo) { |
19947 | default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering." , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 19947); |
19948 | case Intrinsic::x86_avx_vtestz_ps: |
19949 | case Intrinsic::x86_avx_vtestz_pd: |
19950 | case Intrinsic::x86_avx_vtestz_ps_256: |
19951 | case Intrinsic::x86_avx_vtestz_pd_256: |
19952 | IsTestPacked = true; |
19953 | LLVM_FALLTHROUGH; |
19954 | case Intrinsic::x86_sse41_ptestz: |
19955 | case Intrinsic::x86_avx_ptestz_256: |
19956 | // ZF = 1 |
19957 | X86CC = X86::COND_E; |
19958 | break; |
19959 | case Intrinsic::x86_avx_vtestc_ps: |
19960 | case Intrinsic::x86_avx_vtestc_pd: |
19961 | case Intrinsic::x86_avx_vtestc_ps_256: |
19962 | case Intrinsic::x86_avx_vtestc_pd_256: |
19963 | IsTestPacked = true; |
19964 | LLVM_FALLTHROUGH; |
19965 | case Intrinsic::x86_sse41_ptestc: |
19966 | case Intrinsic::x86_avx_ptestc_256: |
19967 | // CF = 1 |
19968 | X86CC = X86::COND_B; |
19969 | break; |
19970 | case Intrinsic::x86_avx_vtestnzc_ps: |
19971 | case Intrinsic::x86_avx_vtestnzc_pd: |
19972 | case Intrinsic::x86_avx_vtestnzc_ps_256: |
19973 | case Intrinsic::x86_avx_vtestnzc_pd_256: |
19974 | IsTestPacked = true; |
19975 | LLVM_FALLTHROUGH; |
19976 | case Intrinsic::x86_sse41_ptestnzc: |
19977 | case Intrinsic::x86_avx_ptestnzc_256: |
19978 | // ZF and CF = 0 |
19979 | X86CC = X86::COND_A; |
19980 | break; |
19981 | } |
19982 | |
19983 | SDValue LHS = Op.getOperand(1); |
19984 | SDValue RHS = Op.getOperand(2); |
19985 | unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; |
19986 | SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); |
19987 | SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); |
19988 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); |
19989 | } |
19990 | case Intrinsic::x86_avx512_kortestz_w: |
19991 | case Intrinsic::x86_avx512_kortestc_w: { |
19992 | X86::CondCode X86CC = |
19993 | (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B; |
19994 | SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); |
19995 | SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); |
19996 | SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); |
19997 | SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); |
19998 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); |
19999 | } |
20000 | |
20001 | case Intrinsic::x86_avx512_knot_w: { |
20002 | SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); |
20003 | SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); |
20004 | SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); |
20005 | return DAG.getBitcast(MVT::i16, Res); |
20006 | } |
20007 | |
20008 | case Intrinsic::x86_avx512_kandn_w: { |
20009 | SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); |
20010 | // Invert LHS for the not. |
20011 | LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, |
20012 | DAG.getConstant(1, dl, MVT::v16i1)); |
20013 | SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); |
20014 | SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); |
20015 | return DAG.getBitcast(MVT::i16, Res); |
20016 | } |
20017 | |
20018 | case Intrinsic::x86_avx512_kxnor_w: { |
20019 | SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); |
20020 | SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); |
20021 | SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); |
20022 | // Invert result for the not. |
20023 | Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, |
20024 | DAG.getConstant(1, dl, MVT::v16i1)); |
20025 | return DAG.getBitcast(MVT::i16, Res); |
20026 | } |
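      | // Editorial note (not in the original source): in the three k-register |
      | // cases above, DAG.getConstant(1, dl, MVT::v16i1) splats the i1 value 1 |
      | // across all 16 lanes, so XOR-ing with it is a 16-bit NOT; KNOT, KANDN and |
      | // KXNOR are thus built from plain ISD::XOR/ISD::AND on v16i1 plus a final |
      | // bitcast back to i16. |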
20027 | |
20028 | case Intrinsic::x86_sse42_pcmpistria128: |
20029 | case Intrinsic::x86_sse42_pcmpestria128: |
20030 | case Intrinsic::x86_sse42_pcmpistric128: |
20031 | case Intrinsic::x86_sse42_pcmpestric128: |
20032 | case Intrinsic::x86_sse42_pcmpistrio128: |
20033 | case Intrinsic::x86_sse42_pcmpestrio128: |
20034 | case Intrinsic::x86_sse42_pcmpistris128: |
20035 | case Intrinsic::x86_sse42_pcmpestris128: |
20036 | case Intrinsic::x86_sse42_pcmpistriz128: |
20037 | case Intrinsic::x86_sse42_pcmpestriz128: { |
20038 | unsigned Opcode; |
20039 | X86::CondCode X86CC; |
20040 | switch (IntNo) { |
20041 | default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 20041); // Can't reach here. |
20042 | case Intrinsic::x86_sse42_pcmpistria128: |
20043 | Opcode = X86ISD::PCMPISTRI; |
20044 | X86CC = X86::COND_A; |
20045 | break; |
20046 | case Intrinsic::x86_sse42_pcmpestria128: |
20047 | Opcode = X86ISD::PCMPESTRI; |
20048 | X86CC = X86::COND_A; |
20049 | break; |
20050 | case Intrinsic::x86_sse42_pcmpistric128: |
20051 | Opcode = X86ISD::PCMPISTRI; |
20052 | X86CC = X86::COND_B; |
20053 | break; |
20054 | case Intrinsic::x86_sse42_pcmpestric128: |
20055 | Opcode = X86ISD::PCMPESTRI; |
20056 | X86CC = X86::COND_B; |
20057 | break; |
20058 | case Intrinsic::x86_sse42_pcmpistrio128: |
20059 | Opcode = X86ISD::PCMPISTRI; |
20060 | X86CC = X86::COND_O; |
20061 | break; |
20062 | case Intrinsic::x86_sse42_pcmpestrio128: |
20063 | Opcode = X86ISD::PCMPESTRI; |
20064 | X86CC = X86::COND_O; |
20065 | break; |
20066 | case Intrinsic::x86_sse42_pcmpistris128: |
20067 | Opcode = X86ISD::PCMPISTRI; |
20068 | X86CC = X86::COND_S; |
20069 | break; |
20070 | case Intrinsic::x86_sse42_pcmpestris128: |
20071 | Opcode = X86ISD::PCMPESTRI; |
20072 | X86CC = X86::COND_S; |
20073 | break; |
20074 | case Intrinsic::x86_sse42_pcmpistriz128: |
20075 | Opcode = X86ISD::PCMPISTRI; |
20076 | X86CC = X86::COND_E; |
20077 | break; |
20078 | case Intrinsic::x86_sse42_pcmpestriz128: |
20079 | Opcode = X86ISD::PCMPESTRI; |
20080 | X86CC = X86::COND_E; |
20081 | break; |
20082 | } |
20083 | SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); |
20084 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); |
20085 | SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); |
20086 | SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); |
20087 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); |
20088 | } |
20089 | |
20090 | case Intrinsic::x86_sse42_pcmpistri128: |
20091 | case Intrinsic::x86_sse42_pcmpestri128: { |
20092 | unsigned Opcode; |
20093 | if (IntNo == Intrinsic::x86_sse42_pcmpistri128) |
20094 | Opcode = X86ISD::PCMPISTRI; |
20095 | else |
20096 | Opcode = X86ISD::PCMPESTRI; |
20097 | |
20098 | SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); |
20099 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); |
20100 | return DAG.getNode(Opcode, dl, VTs, NewOps); |
20101 | } |
20102 | |
20103 | case Intrinsic::eh_sjlj_lsda: { |
20104 | MachineFunction &MF = DAG.getMachineFunction(); |
20105 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
20106 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); |
20107 | auto &Context = MF.getMMI().getContext(); |
20108 | MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + |
20109 | Twine(MF.getFunctionNumber())); |
20110 | return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT)); |
20111 | } |
20112 | |
20113 | case Intrinsic::x86_seh_lsda: { |
20114 | // Compute the symbol for the LSDA. We know it'll get emitted later. |
20115 | MachineFunction &MF = DAG.getMachineFunction(); |
20116 | SDValue Op1 = Op.getOperand(1); |
20117 | auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); |
20118 | MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( |
20119 | GlobalValue::dropLLVMManglingEscape(Fn->getName())); |
20120 | |
20121 | // Generate a simple absolute symbol reference. This intrinsic is only |
20122 | // supported on 32-bit Windows, which isn't PIC. |
20123 | SDValue Result = DAG.getMCSymbol(LSDASym, VT); |
20124 | return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); |
20125 | } |
20126 | |
20127 | case Intrinsic::x86_seh_recoverfp: { |
20128 | SDValue FnOp = Op.getOperand(1); |
20129 | SDValue IncomingFPOp = Op.getOperand(2); |
20130 | GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); |
20131 | auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); |
20132 | if (!Fn) |
20133 | report_fatal_error( |
20134 | "llvm.x86.seh.recoverfp must take a function as the first argument"); |
20135 | return recoverFramePointer(DAG, Fn, IncomingFPOp); |
20136 | } |
20137 | |
20138 | case Intrinsic::localaddress: { |
20139 | // Returns one of the stack, base, or frame pointer registers, depending on |
20140 | // which is used to reference local variables. |
20141 | MachineFunction &MF = DAG.getMachineFunction(); |
20142 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20143 | unsigned Reg; |
20144 | if (RegInfo->hasBasePointer(MF)) |
20145 | Reg = RegInfo->getBaseRegister(); |
20146 | else // This function handles the SP or FP case. |
20147 | Reg = RegInfo->getPtrSizedFrameRegister(MF); |
20148 | return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); |
20149 | } |
20150 | } |
20151 | } |
20152 | |
20153 | static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, |
20154 | SDValue Src, SDValue Mask, SDValue Base, |
20155 | SDValue Index, SDValue ScaleOp, SDValue Chain, |
20156 | const X86Subtarget &Subtarget) { |
20157 | SDLoc dl(Op); |
20158 | auto *C = cast<ConstantSDNode>(ScaleOp); |
20159 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); |
20160 | EVT MaskVT = Mask.getValueType(); |
20161 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); |
20162 | SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); |
20163 | SDValue Segment = DAG.getRegister(0, MVT::i32); |
20164 | // If source is undef or we know it won't be used, use a zero vector |
20165 | // to break register dependency. |
20166 | // TODO: use undef instead and let ExecutionDepsFix deal with it? |
20167 | if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) |
20168 | Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); |
20169 | SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; |
20170 | SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); |
20171 | SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; |
20172 | return DAG.getMergeValues(RetOps, dl); |
20173 | } |
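      | // Editorial note (not in the original source): the machine gather built |
      | // above yields three results - the gathered vector (0), the written-back |
      | // mask (1), and the chain (2). RetOps forwards only results 0 and 2, |
      | // discarding the mask value that the hardware gather clobbers. |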
20174 | |
20175 | static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, |
20176 | SDValue Src, SDValue Mask, SDValue Base, |
20177 | SDValue Index, SDValue ScaleOp, SDValue Chain, |
20178 | const X86Subtarget &Subtarget) { |
20179 | SDLoc dl(Op); |
20180 | auto *C = cast<ConstantSDNode>(ScaleOp); |
20181 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); |
20182 | MVT MaskVT = MVT::getVectorVT(MVT::i1, |
20183 | Index.getSimpleValueType().getVectorNumElements()); |
20184 | |
20185 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20186 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); |
20187 | SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); |
20188 | SDValue Segment = DAG.getRegister(0, MVT::i32); |
20189 | // If source is undef or we know it won't be used, use a zero vector |
20190 | // to break register dependency. |
20191 | // TODO: use undef instead and let ExecutionDepsFix deal with it? |
20192 | if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) |
20193 | Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); |
20194 | SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; |
20195 | SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); |
20196 | SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; |
20197 | return DAG.getMergeValues(RetOps, dl); |
20198 | } |
20199 | |
20200 | static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, |
20201 | SDValue Src, SDValue Mask, SDValue Base, |
20202 | SDValue Index, SDValue ScaleOp, SDValue Chain, |
20203 | const X86Subtarget &Subtarget) { |
20204 | SDLoc dl(Op); |
20205 | auto *C = cast<ConstantSDNode>(ScaleOp); |
20206 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); |
20207 | SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); |
20208 | SDValue Segment = DAG.getRegister(0, MVT::i32); |
20209 | MVT MaskVT = MVT::getVectorVT(MVT::i1, |
20210 | Index.getSimpleValueType().getVectorNumElements()); |
20211 | |
20212 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20213 | SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); |
20214 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; |
20215 | SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); |
20216 | return SDValue(Res, 1); |
20217 | } |
20218 | |
20219 | static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, |
20220 | SDValue Mask, SDValue Base, SDValue Index, |
20221 | SDValue ScaleOp, SDValue Chain, |
20222 | const X86Subtarget &Subtarget) { |
20223 | SDLoc dl(Op); |
20224 | auto *C = cast<ConstantSDNode>(ScaleOp); |
20225 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); |
20226 | SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); |
20227 | SDValue Segment = DAG.getRegister(0, MVT::i32); |
20228 | MVT MaskVT = |
20229 | MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); |
20230 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20231 | SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; |
20232 | SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); |
20233 | return SDValue(Res, 0); |
20234 | } |
20235 | |
20236 | /// Handles the lowering of builtin intrinsics that return the value |
20237 | /// of the extended control register. |
20238 | static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, |
20239 | SelectionDAG &DAG, |
20240 | const X86Subtarget &Subtarget, |
20241 | SmallVectorImpl<SDValue> &Results) { |
20242 | assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); |
20243 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); |
20244 | SDValue LO, HI; |
20245 | |
20246 | // The ECX register is used to select the index of the XCR register to |
20247 | // return. |
20248 | SDValue Chain = |
20249 | DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); |
20250 | SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); |
20251 | Chain = SDValue(N1, 0); |
20252 | |
20253 | // Reads the content of XCR and returns it in registers EDX:EAX. |
20254 | if (Subtarget.is64Bit()) { |
20255 | LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); |
20256 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, |
20257 | LO.getValue(2)); |
20258 | } else { |
20259 | LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); |
20260 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, |
20261 | LO.getValue(2)); |
20262 | } |
20263 | Chain = HI.getValue(1); |
20264 | |
20265 | if (Subtarget.is64Bit()) { |
20266 | // Merge the two 32-bit values into a 64-bit one. |
20267 | SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, |
20268 | DAG.getConstant(32, DL, MVT::i8)); |
20269 | Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); |
20270 | Results.push_back(Chain); |
20271 | return; |
20272 | } |
20273 | |
20274 | // Use a buildpair to merge the two 32-bit values into a 64-bit one. |
20275 | SDValue Ops[] = { LO, HI }; |
20276 | SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); |
20277 | Results.push_back(Pair); |
20278 | Results.push_back(Chain); |
20279 | } |
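      | // Editorial sketch (not in the original source): at the C level this |
      | // lowering backs the XGETBV intrinsic, e.g. the common AVX-support probe: |
      | //   #include <immintrin.h> |
      | //   bool osSavesYMMState() { |
      | //     unsigned long long XCR0 = _xgetbv(0);  // XCR0 = XFEATURE_ENABLED_MASK |
      | //     return (XCR0 & 0x6) == 0x6;            // XMM (bit 1) and YMM (bit 2) |
      | //   } |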
20280 | |
20281 | /// Handles the lowering of builtin intrinsics that read performance monitor |
20282 | /// counters (x86_rdpmc). |
20283 | static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, |
20284 | SelectionDAG &DAG, |
20285 | const X86Subtarget &Subtarget, |
20286 | SmallVectorImpl<SDValue> &Results) { |
20287 | assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); |
20288 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); |
20289 | SDValue LO, HI; |
20290 | |
20291 | // The ECX register is used to select the index of the performance counter |
20292 | // to read. |
20293 | SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, |
20294 | N->getOperand(2)); |
20295 | SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); |
20296 | |
20297 | // Reads the content of a 64-bit performance counter and returns it in the |
20298 | // registers EDX:EAX. |
20299 | if (Subtarget.is64Bit()) { |
20300 | LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); |
20301 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, |
20302 | LO.getValue(2)); |
20303 | } else { |
20304 | LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); |
20305 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, |
20306 | LO.getValue(2)); |
20307 | } |
20308 | Chain = HI.getValue(1); |
20309 | |
20310 | if (Subtarget.is64Bit()) { |
20311 | // The EAX register is loaded with the low-order 32 bits. The EDX register |
20312 | // is loaded with the supported high-order bits of the counter. |
20313 | SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, |
20314 | DAG.getConstant(32, DL, MVT::i8)); |
20315 | Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); |
20316 | Results.push_back(Chain); |
20317 | return; |
20318 | } |
20319 | |
20320 | // Use a buildpair to merge the two 32-bit values into a 64-bit one. |
20321 | SDValue Ops[] = { LO, HI }; |
20322 | SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); |
20323 | Results.push_back(Pair); |
20324 | Results.push_back(Chain); |
20325 | } |
20326 | |
20327 | /// Handles the lowering of builtin intrinsics that read the time stamp counter |
20328 | /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower |
20329 | /// READCYCLECOUNTER nodes. |
20330 | static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, |
20331 | SelectionDAG &DAG, |
20332 | const X86Subtarget &Subtarget, |
20333 | SmallVectorImpl<SDValue> &Results) { |
20334 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); |
20335 | SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); |
20336 | SDValue LO, HI; |
20337 | |
20338 | // The processor's time-stamp counter (a 64-bit MSR) is stored into the |
20339 | // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR |
20340 | // and the EAX register is loaded with the low-order 32 bits. |
20341 | if (Subtarget.is64Bit()) { |
20342 | LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); |
20343 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, |
20344 | LO.getValue(2)); |
20345 | } else { |
20346 | LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); |
20347 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, |
20348 | LO.getValue(2)); |
20349 | } |
20350 | SDValue Chain = HI.getValue(1); |
20351 | |
20352 | if (Opcode == X86ISD::RDTSCP_DAG) { |
20353 | assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); |
20354 | |
20355 | // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into |
20356 | // the ECX register. Add 'ecx' explicitly to the chain. |
20357 | SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, |
20358 | HI.getValue(2)); |
20359 | // Explicitly store the content of ECX at the location passed in input |
20360 | // to the 'rdtscp' intrinsic. |
20361 | Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), |
20362 | MachinePointerInfo()); |
20363 | } |
20364 | |
20365 | if (Subtarget.is64Bit()) { |
20366 | // The EDX register is loaded with the high-order 32 bits of the MSR, and |
20367 | // the EAX register is loaded with the low-order 32 bits. |
20368 | SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, |
20369 | DAG.getConstant(32, DL, MVT::i8)); |
20370 | Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); |
20371 | Results.push_back(Chain); |
20372 | return; |
20373 | } |
20374 | |
20375 | // Use a buildpair to merge the two 32-bit values into a 64-bit one. |
20376 | SDValue Ops[] = { LO, HI }; |
20377 | SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); |
20378 | Results.push_back(Pair); |
20379 | Results.push_back(Chain); |
20380 | } |
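      | // Editorial sketch (not in the original source): the SHL/OR merge (64-bit) |
      | // vs. BUILD_PAIR (32-bit) above is what lets the C-level builtins return a |
      | // single 64-bit counter value on either target: |
      | //   unsigned int Aux; |
      | //   unsigned long long T1 = __builtin_ia32_rdtsc(); |
      | //   unsigned long long T2 = __builtin_ia32_rdtscp(&Aux); // also stores TSC_AUX |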
20381 | |
20382 | static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, |
20383 | SelectionDAG &DAG) { |
20384 | SmallVector<SDValue, 2> Results; |
20385 | SDLoc DL(Op); |
20386 | getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, |
20387 | Results); |
20388 | return DAG.getMergeValues(Results, DL); |
20389 | } |
20390 | |
20391 | static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { |
20392 | MachineFunction &MF = DAG.getMachineFunction(); |
20393 | SDValue Chain = Op.getOperand(0); |
20394 | SDValue RegNode = Op.getOperand(2); |
20395 | WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); |
20396 | if (!EHInfo) |
20397 | report_fatal_error("EH registrations only live in functions using WinEH"); |
20398 | |
20399 | // Cast the operand to an alloca, and remember the frame index. |
20400 | auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); |
20401 | if (!FINode) |
20402 | report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); |
20403 | EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); |
20404 | |
20405 | // Return the chain operand without making any DAG nodes. |
20406 | return Chain; |
20407 | } |
20408 | |
20409 | static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { |
20410 | MachineFunction &MF = DAG.getMachineFunction(); |
20411 | SDValue Chain = Op.getOperand(0); |
20412 | SDValue EHGuard = Op.getOperand(2); |
20413 | WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); |
20414 | if (!EHInfo) |
20415 | report_fatal_error("EHGuard only live in functions using WinEH"); |
20416 | |
20417 | // Cast the operand to an alloca, and remember the frame index. |
20418 | auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard); |
20419 | if (!FINode) |
20420 | report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); |
20421 | EHInfo->EHGuardFrameIndex = FINode->getIndex(); |
20422 | |
20423 | // Return the chain operand without making any DAG nodes. |
20424 | return Chain; |
20425 | } |
20426 | |
20427 | /// Emit Truncating Store with signed or unsigned saturation. |
20428 | static SDValue |
20429 | EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, |
20430 | SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, |
20431 | SelectionDAG &DAG) { |
20432 | |
20433 | SDVTList VTs = DAG.getVTList(MVT::Other); |
20434 | SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); |
20435 | SDValue Ops[] = { Chain, Val, Ptr, Undef }; |
20436 | return SignedSat ? |
20437 | DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : |
20438 | DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); |
20439 | } |
20440 | |
20441 | /// Emit Masked Truncating Store with signed or unsigned saturation. |
20442 | static SDValue |
20443 | EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, |
20444 | SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, |
20445 | MachineMemOperand *MMO, SelectionDAG &DAG) { |
20446 | |
20447 | SDVTList VTs = DAG.getVTList(MVT::Other); |
20448 | SDValue Ops[] = { Chain, Ptr, Mask, Val }; |
20449 | return SignedSat ? |
20450 | DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : |
20451 | DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); |
20452 | } |
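      | // Editorial note (not in the original source): the two emitters above use |
      | // different operand orders - { Chain, Val, Ptr, Undef } for the plain |
      | // saturating store vs. { Chain, Ptr, Mask, Val } for the masked form - |
      | // matching what the Trunc(U)SStore and MaskedTrunc(U)SStore nodes expect. |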
20453 | |
20454 | static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, |
20455 | SelectionDAG &DAG) { |
20456 | unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); |
20457 | |
20458 | const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); |
20459 | if (!IntrData) { |
20460 | switch (IntNo) { |
20461 | case llvm::Intrinsic::x86_seh_ehregnode: |
20462 | return MarkEHRegistrationNode(Op, DAG); |
20463 | case llvm::Intrinsic::x86_seh_ehguard: |
20464 | return MarkEHGuard(Op, DAG); |
20465 | case llvm::Intrinsic::x86_flags_read_u32: |
20466 | case llvm::Intrinsic::x86_flags_read_u64: |
20467 | case llvm::Intrinsic::x86_flags_write_u32: |
20468 | case llvm::Intrinsic::x86_flags_write_u64: { |
20469 | // We need a frame pointer because this will get lowered to a PUSH/POP |
20470 | // sequence. |
20471 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
20472 | MFI.setHasCopyImplyingStackAdjustment(true); |
20473 | // Don't do anything here, we will expand these intrinsics out later |
20474 | // during ExpandISelPseudos in EmitInstrWithCustomInserter. |
20475 | return SDValue(); |
20476 | } |
20477 | case Intrinsic::x86_lwpins32: |
20478 | case Intrinsic::x86_lwpins64: { |
20479 | SDLoc dl(Op); |
20480 | SDValue Chain = Op->getOperand(0); |
20481 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); |
20482 | SDValue LwpIns = |
20483 | DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2), |
20484 | Op->getOperand(3), Op->getOperand(4)); |
20485 | SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG); |
20486 | SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); |
20487 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, |
20488 | LwpIns.getValue(1)); |
20489 | } |
20490 | } |
20491 | return SDValue(); |
20492 | } |
20493 | |
20494 | SDLoc dl(Op); |
20495 | switch(IntrData->Type) { |
20496 | default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 20496); |
20497 | case RDSEED: |
20498 | case RDRAND: { |
20499 | // Emit the node with the right value type. |
20500 | SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); |
20501 | SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); |
20502 | |
20503 | // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. |
20504 | // Otherwise return the value from Rand, which is always 0, cast to i32. |
20505 | SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), |
20506 | DAG.getConstant(1, dl, Op->getValueType(1)), |
20507 | DAG.getConstant(X86::COND_B, dl, MVT::i32), |
20508 | SDValue(Result.getNode(), 1) }; |
20509 | SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, |
20510 | DAG.getVTList(Op->getValueType(1), MVT::Glue), |
20511 | Ops); |
20512 | |
20513 | // Return { result, isValid, chain }. |
20514 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, |
20515 | SDValue(Result.getNode(), 2)); |
20516 | } |
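      | // Editorial sketch (not in the original source): the { result, isValid } |
      | // pair produced above backs the usual retry loop around RDRAND: |
      | //   #include <immintrin.h> |
      | //   unsigned int R; |
      | //   while (!_rdrand32_step(&R)) |
      | //     ;  // CF was 0: no entropy available yet, retry |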
20517 | case GATHER_AVX2: { |
20518 | SDValue Chain = Op.getOperand(0); |
20519 | SDValue Src = Op.getOperand(2); |
20520 | SDValue Base = Op.getOperand(3); |
20521 | SDValue Index = Op.getOperand(4); |
20522 | SDValue Mask = Op.getOperand(5); |
20523 | SDValue Scale = Op.getOperand(6); |
20524 | return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, |
20525 | Scale, Chain, Subtarget); |
20526 | } |
20527 | case GATHER: { |
20528 | // gather(v1, mask, index, base, scale); |
20529 | SDValue Chain = Op.getOperand(0); |
20530 | SDValue Src = Op.getOperand(2); |
20531 | SDValue Base = Op.getOperand(3); |
20532 | SDValue Index = Op.getOperand(4); |
20533 | SDValue Mask = Op.getOperand(5); |
20534 | SDValue Scale = Op.getOperand(6); |
20535 | return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, |
20536 | Chain, Subtarget); |
20537 | } |
20538 | case SCATTER: { |
20539 | // scatter(base, mask, index, v1, scale); |
20540 | SDValue Chain = Op.getOperand(0); |
20541 | SDValue Base = Op.getOperand(2); |
20542 | SDValue Mask = Op.getOperand(3); |
20543 | SDValue Index = Op.getOperand(4); |
20544 | SDValue Src = Op.getOperand(5); |
20545 | SDValue Scale = Op.getOperand(6); |
20546 | return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, |
20547 | Scale, Chain, Subtarget); |
20548 | } |
20549 | case PREFETCH: { |
20550 | SDValue Hint = Op.getOperand(6); |
20551 | unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); |
20552 | assert((HintVal == 2 || HintVal == 3) && |
20553 | "Wrong prefetch hint in intrinsic: should be 2 or 3"); |
20554 | unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); |
20555 | SDValue Chain = Op.getOperand(0); |
20556 | SDValue Mask = Op.getOperand(2); |
20557 | SDValue Index = Op.getOperand(3); |
20558 | SDValue Base = Op.getOperand(4); |
20559 | SDValue Scale = Op.getOperand(5); |
20560 | return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, |
20561 | Subtarget); |
20562 | } |
20563 | // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). |
20564 | case RDTSC: { |
20565 | SmallVector<SDValue, 2> Results; |
20566 | getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, |
20567 | Results); |
20568 | return DAG.getMergeValues(Results, dl); |
20569 | } |
20570 | // Read Performance Monitoring Counters. |
20571 | case RDPMC: { |
20572 | SmallVector<SDValue, 2> Results; |
20573 | getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); |
20574 | return DAG.getMergeValues(Results, dl); |
20575 | } |
20576 | // Get Extended Control Register. |
20577 | case XGETBV: { |
20578 | SmallVector<SDValue, 2> Results; |
20579 | getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); |
20580 | return DAG.getMergeValues(Results, dl); |
20581 | } |
20582 | // XTEST intrinsics. |
20583 | case XTEST: { |
20584 | SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); |
20585 | SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); |
20586 | |
20587 | SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); |
20588 | SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); |
20589 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), |
20590 | Ret, SDValue(InTrans.getNode(), 1)); |
20591 | } |
20592 | // ADC/ADCX/SBB |
20593 | case ADX: { |
20594 | SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); |
20595 | SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); |
20596 | SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), |
20597 | DAG.getConstant(-1, dl, MVT::i8)); |
20598 | SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), |
20599 | Op.getOperand(4), GenCF.getValue(1)); |
20600 | SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), |
20601 | Op.getOperand(5), MachinePointerInfo()); |
20602 | SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); |
20603 | SDValue Results[] = { SetCC, Store }; |
20604 | return DAG.getMergeValues(Results, dl); |
20605 | } |
20606 | case COMPRESS_TO_MEM: { |
20607 | SDValue Mask = Op.getOperand(4); |
20608 | SDValue DataToCompress = Op.getOperand(3); |
20609 | SDValue Addr = Op.getOperand(2); |
20610 | SDValue Chain = Op.getOperand(0); |
20611 | MVT VT = DataToCompress.getSimpleValueType(); |
20612 | |
20613 | MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); |
20614 | assert(MemIntr && "Expected MemIntrinsicSDNode!"); |
20615 | |
20616 | if (isAllOnesConstant(Mask)) // return just a store |
20617 | return DAG.getStore(Chain, dl, DataToCompress, Addr, |
20618 | MemIntr->getMemOperand()); |
20619 | |
20620 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
20621 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20622 | |
20623 | return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT, |
20624 | MemIntr->getMemOperand(), |
20625 | false /* truncating */, true /* compressing */); |
20626 | } |
20627 | case TRUNCATE_TO_MEM_VI8: |
20628 | case TRUNCATE_TO_MEM_VI16: |
20629 | case TRUNCATE_TO_MEM_VI32: { |
20630 | SDValue Mask = Op.getOperand(4); |
20631 | SDValue DataToTruncate = Op.getOperand(3); |
20632 | SDValue Addr = Op.getOperand(2); |
20633 | SDValue Chain = Op.getOperand(0); |
20634 | |
20635 | MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); |
20636 | assert(MemIntr && "Expected MemIntrinsicSDNode!"); |
20637 | |
20638 | EVT MemVT = MemIntr->getMemoryVT(); |
20639 | |
20640 | uint16_t TruncationOp = IntrData->Opc0; |
20641 | switch (TruncationOp) { |
20642 | case X86ISD::VTRUNC: { |
20643 | if (isAllOnesConstant(Mask)) // return just a truncate store |
20644 | return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, |
20645 | MemIntr->getMemOperand()); |
20646 | |
20647 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); |
20648 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20649 | |
20650 | return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, |
20651 | MemIntr->getMemOperand(), true /* truncating */); |
20652 | } |
20653 | case X86ISD::VTRUNCUS: |
20654 | case X86ISD::VTRUNCS: { |
20655 | bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); |
20656 | if (isAllOnesConstant(Mask)) |
20657 | return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, |
20658 | MemIntr->getMemOperand(), DAG); |
20659 | |
20660 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); |
20661 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20662 | |
20663 | return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, |
20664 | VMask, MemVT, MemIntr->getMemOperand(), DAG); |
20665 | } |
20666 | default: |
20667 | llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 20667); |
20668 | } |
20669 | } |
20670 | |
20671 | case EXPAND_FROM_MEM: { |
20672 | SDValue Mask = Op.getOperand(4); |
20673 | SDValue PassThru = Op.getOperand(3); |
20674 | SDValue Addr = Op.getOperand(2); |
20675 | SDValue Chain = Op.getOperand(0); |
20676 | MVT VT = Op.getSimpleValueType(); |
20677 | |
20678 | MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); |
20679 | assert(MemIntr && "Expected MemIntrinsicSDNode!"); |
20680 | |
20681 | if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load. |
20682 | return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); |
20683 | if (X86::isZeroNode(Mask)) |
20684 | return DAG.getUNDEF(VT); |
20685 | |
20686 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
20687 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); |
20688 | return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, |
20689 | MemIntr->getMemOperand(), ISD::NON_EXTLOAD, |
20690 | true /* expanding */); |
20691 | } |
20692 | } |
20693 | } |
20694 | |
20695 | SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, |
20696 | SelectionDAG &DAG) const { |
20697 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
20698 | MFI.setReturnAddressIsTaken(true); |
20699 | |
20700 | if (verifyReturnAddressArgumentIsConstant(Op, DAG)) |
20701 | return SDValue(); |
20702 | |
20703 | unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
20704 | SDLoc dl(Op); |
20705 | EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
20706 | |
20707 | if (Depth > 0) { |
20708 | SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); |
20709 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20710 | SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); |
20711 | return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), |
20712 | DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), |
20713 | MachinePointerInfo()); |
20714 | } |
20715 | |
20716 | // Just load the return address. |
20717 | SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); |
20718 | return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, |
20719 | MachinePointerInfo()); |
20720 | } |
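      | // Editorial sketch (not in the original source): this lowers |
      | // llvm.returnaddress, i.e. __builtin_return_address(N) in C. Depth 0 loads |
      | // straight from the return-address frame index; deeper frames load at |
      | // FrameAddr + SlotSize: |
      | //   void *Mine = __builtin_return_address(0);  // this frame's caller |
      | //   void *Upper = __builtin_return_address(1); // one frame further up |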
20721 | |
20722 | SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, |
20723 | SelectionDAG &DAG) const { |
20724 | DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); |
20725 | return getReturnAddressFrameIndex(DAG); |
20726 | } |
20727 | |
20728 | SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { |
20729 | MachineFunction &MF = DAG.getMachineFunction(); |
20730 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
20731 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
20732 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20733 | EVT VT = Op.getValueType(); |
20734 | |
20735 | MFI.setFrameAddressIsTaken(true); |
20736 | |
20737 | if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { |
20738 | // Depth > 0 makes no sense on targets which use Windows unwind codes. It |
20739 | // is not possible to crawl up the stack without looking at the unwind codes |
20740 | // simultaneously. |
20741 | int FrameAddrIndex = FuncInfo->getFAIndex(); |
20742 | if (!FrameAddrIndex) { |
20743 | // Set up a frame object for the return address. |
20744 | unsigned SlotSize = RegInfo->getSlotSize(); |
20745 | FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( |
20746 | SlotSize, /*Offset=*/0, /*IsImmutable=*/false); |
20747 | FuncInfo->setFAIndex(FrameAddrIndex); |
20748 | } |
20749 | return DAG.getFrameIndex(FrameAddrIndex, VT); |
20750 | } |
20751 | |
20752 | unsigned FrameReg = |
20753 | RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); |
20754 | SDLoc dl(Op); // FIXME probably not meaningful |
20755 | unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); |
20756 | assert(((FrameReg == X86::RBP && VT == MVT::i64) || |
20757 | (FrameReg == X86::EBP && VT == MVT::i32)) && |
20758 | "Invalid Frame Register!"); |
20759 | SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); |
20760 | while (Depth--) |
20761 | FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, |
20762 | MachinePointerInfo()); |
20763 | return FrameAddr; |
20764 | } |
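      | // Editorial sketch (not in the original source): this backs |
      | // __builtin_frame_address(N); the while (Depth--) loop chases the saved |
      | // frame pointer stored at offset 0 of each frame: |
      | //   void *FP0 = __builtin_frame_address(0); // current RBP/EBP |
      | //   void *FP2 = __builtin_frame_address(2); // two loads up the FP chain |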
20765 | |
20766 | // FIXME? Maybe this could be a TableGen attribute on some registers and |
20767 | // this table could be generated automatically from RegInfo. |
20768 | unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, |
20769 | SelectionDAG &DAG) const { |
20770 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); |
20771 | const MachineFunction &MF = DAG.getMachineFunction(); |
20772 | |
20773 | unsigned Reg = StringSwitch<unsigned>(RegName) |
20774 | .Case("esp", X86::ESP) |
20775 | .Case("rsp", X86::RSP) |
20776 | .Case("ebp", X86::EBP) |
20777 | .Case("rbp", X86::RBP) |
20778 | .Default(0); |
20779 | |
20780 | if (Reg == X86::EBP || Reg == X86::RBP) { |
20781 | if (!TFI.hasFP(MF)) |
20782 | report_fatal_error("register " + StringRef(RegName) + |
20783 | " is allocatable: function has no frame pointer"); |
20784 | #ifndef NDEBUG |
20785 | else { |
20786 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20787 | unsigned FrameReg = |
20788 | RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); |
20789 | assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && |
20790 | "Invalid Frame Register!"); |
20791 | } |
20792 | #endif |
20793 | } |
20794 | |
20795 | if (Reg) |
20796 | return Reg; |
20797 | |
20798 | report_fatal_error("Invalid register name global variable"); |
20799 | } |
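      | // Editorial sketch (not in the original source): this hook resolves the |
      | // names used by llvm.read_register/llvm.write_register, reachable from C |
      | // via a named global register variable (assuming GCC/Clang support): |
      | //   register unsigned long StackPtr __asm__("rsp"); // reads lower to RSP |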
20800 | |
20801 | SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, |
20802 | SelectionDAG &DAG) const { |
20803 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20804 | return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); |
20805 | } |
20806 | |
20807 | unsigned X86TargetLowering::getExceptionPointerRegister( |
20808 | const Constant *PersonalityFn) const { |
20809 | if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) |
20810 | return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; |
20811 | |
20812 | return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; |
20813 | } |
20814 | |
20815 | unsigned X86TargetLowering::getExceptionSelectorRegister( |
20816 | const Constant *PersonalityFn) const { |
20817 | // Funclet personalities don't use selectors (the runtime does the selection). |
20818 | assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); |
20819 | return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; |
20820 | } |
20821 | |
20822 | bool X86TargetLowering::needsFixedCatchObjects() const { |
20823 | return Subtarget.isTargetWin64(); |
20824 | } |
20825 | |
20826 | SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { |
20827 | SDValue Chain = Op.getOperand(0); |
20828 | SDValue Offset = Op.getOperand(1); |
20829 | SDValue Handler = Op.getOperand(2); |
20830 |   SDLoc dl(Op);
20831 | |
20832 | EVT PtrVT = getPointerTy(DAG.getDataLayout()); |
20833 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
20834 | unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); |
20835 |   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20836 |           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20837 |          "Invalid Frame Register!");
20838 | SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); |
20839 | unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; |
20840 | |
20841 | SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, |
20842 | DAG.getIntPtrConstant(RegInfo->getSlotSize(), |
20843 | dl)); |
20844 | StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); |
20845 | Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); |
20846 | Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); |
20847 | |
20848 | return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, |
20849 | DAG.getRegister(StoreAddrReg, PtrVT)); |
20850 | } |
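      | // To make the address computation above concrete: the handler is stored at
      | //   StoreAddr = FrameReg + SlotSize + Offset
      | // i.e. into the return-address slot just above the saved frame pointer,
      | // displaced by the Offset operand; ECX/RCX then carries the adjusted
      | // stack pointer into the EH_RETURN node.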
20851 | |
20852 | SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, |
20853 | SelectionDAG &DAG) const { |
20854 | SDLoc DL(Op); |
20855 |   // If the subtarget is not 64-bit, we may need the global base reg
20856 |   // after the pseudo is expanded post-isel, i.e., after the CGBR pass has run.
20857 |   // Therefore, ask for the GlobalBaseReg now, so that the pass
20858 |   // inserts the code for us in case we need it.
20859 |   // Otherwise, we would end up referencing a virtual register
20860 |   // that is never defined!
20861 | if (!Subtarget.is64Bit()) { |
20862 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
20863 | (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); |
20864 | } |
20865 | return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, |
20866 | DAG.getVTList(MVT::i32, MVT::Other), |
20867 | Op.getOperand(0), Op.getOperand(1)); |
20868 | } |
20869 | |
20870 | SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, |
20871 | SelectionDAG &DAG) const { |
20872 | SDLoc DL(Op); |
20873 | return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, |
20874 | Op.getOperand(0), Op.getOperand(1)); |
20875 | } |
20876 | |
20877 | SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, |
20878 | SelectionDAG &DAG) const { |
20879 | SDLoc DL(Op); |
20880 | return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, |
20881 | Op.getOperand(0)); |
20882 | } |
20883 | |
20884 | static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { |
20885 | return Op.getOperand(0); |
20886 | } |
20887 | |
20888 | SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, |
20889 | SelectionDAG &DAG) const { |
20890 | SDValue Root = Op.getOperand(0); |
20891 | SDValue Trmp = Op.getOperand(1); // trampoline |
20892 | SDValue FPtr = Op.getOperand(2); // nested function |
20893 | SDValue Nest = Op.getOperand(3); // 'nest' parameter value |
20894 |   SDLoc dl(Op);
20895 | |
20896 | const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); |
20897 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
20898 | |
20899 | if (Subtarget.is64Bit()) { |
20900 | SDValue OutChains[6]; |
20901 | |
20902 | // Large code-model. |
20903 | const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. |
20904 | const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. |
20905 | |
20906 | const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; |
20907 | const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; |
20908 | |
20909 | const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix |
20910 | |
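      |     // For reference, the stores below assemble a 24-byte trampoline with
      |     // this byte layout (offsets match the MachinePointerInfo values):
      |     //
      |     //   0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
      |     //   10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
      |     //   20: 49 FF E3                jmpq    *%r11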
20911 | // Load the pointer to the nested function into R11. |
20912 | unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 |
20913 | SDValue Addr = Trmp; |
20914 | OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), |
20915 | Addr, MachinePointerInfo(TrmpAddr)); |
20916 | |
20917 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, |
20918 | DAG.getConstant(2, dl, MVT::i64)); |
20919 | OutChains[1] = |
20920 | DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), |
20921 | /* Alignment = */ 2); |
20922 | |
20923 | // Load the 'nest' parameter value into R10. |
20924 | // R10 is specified in X86CallingConv.td |
20925 | OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 |
20926 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, |
20927 | DAG.getConstant(10, dl, MVT::i64)); |
20928 | OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), |
20929 | Addr, MachinePointerInfo(TrmpAddr, 10)); |
20930 | |
20931 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, |
20932 | DAG.getConstant(12, dl, MVT::i64)); |
20933 | OutChains[3] = |
20934 | DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), |
20935 | /* Alignment = */ 2); |
20936 | |
20937 | // Jump to the nested function. |
20938 | OpCode = (JMP64r << 8) | REX_WB; // jmpq *... |
20939 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, |
20940 | DAG.getConstant(20, dl, MVT::i64)); |
20941 | OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), |
20942 | Addr, MachinePointerInfo(TrmpAddr, 20)); |
20943 | |
20944 | unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 |
20945 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, |
20946 | DAG.getConstant(22, dl, MVT::i64)); |
20947 | OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), |
20948 | Addr, MachinePointerInfo(TrmpAddr, 22)); |
20949 | |
20950 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); |
20951 | } else { |
20952 | const Function *Func = |
20953 | cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); |
20954 | CallingConv::ID CC = Func->getCallingConv(); |
20955 | unsigned NestReg; |
20956 | |
20957 | switch (CC) { |
20958 | default: |
20959 |       llvm_unreachable("Unsupported calling convention");
20960 | case CallingConv::C: |
20961 | case CallingConv::X86_StdCall: { |
20962 | // Pass 'nest' parameter in ECX. |
20963 | // Must be kept in sync with X86CallingConv.td |
20964 | NestReg = X86::ECX; |
20965 | |
20966 | // Check that ECX wasn't needed by an 'inreg' parameter. |
20967 | FunctionType *FTy = Func->getFunctionType(); |
20968 | const AttributeList &Attrs = Func->getAttributes(); |
20969 | |
20970 | if (!Attrs.isEmpty() && !Func->isVarArg()) { |
20971 | unsigned InRegCount = 0; |
20972 | unsigned Idx = 1; |
20973 | |
20974 | for (FunctionType::param_iterator I = FTy->param_begin(), |
20975 | E = FTy->param_end(); I != E; ++I, ++Idx) |
20976 | if (Attrs.hasAttribute(Idx, Attribute::InReg)) { |
20977 | auto &DL = DAG.getDataLayout(); |
20978 | // FIXME: should only count parameters that are lowered to integers. |
20979 | InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; |
20980 | } |
20981 | |
20982 | if (InRegCount > 2) { |
20983 | report_fatal_error("Nest register in use - reduce number of inreg" |
20984 | " parameters!"); |
20985 | } |
20986 | } |
20987 | break; |
20988 | } |
20989 | case CallingConv::X86_FastCall: |
20990 | case CallingConv::X86_ThisCall: |
20991 | case CallingConv::Fast: |
20992 | // Pass 'nest' parameter in EAX. |
20993 | // Must be kept in sync with X86CallingConv.td |
20994 | NestReg = X86::EAX; |
20995 | break; |
20996 | } |
20997 | |
20998 | SDValue OutChains[4]; |
20999 | SDValue Addr, Disp; |
21000 | |
21001 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, |
21002 | DAG.getConstant(10, dl, MVT::i32)); |
21003 | Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); |
21004 | |
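      |     // For reference, the 32-bit trampoline assembled below is 10 bytes
      |     // (offsets match the MachinePointerInfo values):
      |     //
      |     //   0: B8+reg <Nest, 4 bytes>   movl $Nest, %eax/%ecx
      |     //   5: E9     <Disp, 4 bytes>   jmp  FPtr (rel32 from Trmp + 10)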
21005 | // This is storing the opcode for MOV32ri. |
21006 | const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. |
21007 | const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; |
21008 | OutChains[0] = |
21009 | DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), |
21010 | Trmp, MachinePointerInfo(TrmpAddr)); |
21011 | |
21012 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, |
21013 | DAG.getConstant(1, dl, MVT::i32)); |
21014 | OutChains[1] = |
21015 | DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), |
21016 | /* Alignment = */ 1); |
21017 | |
21018 | const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. |
21019 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, |
21020 | DAG.getConstant(5, dl, MVT::i32)); |
21021 | OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), |
21022 | Addr, MachinePointerInfo(TrmpAddr, 5), |
21023 | /* Alignment = */ 1); |
21024 | |
21025 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, |
21026 | DAG.getConstant(6, dl, MVT::i32)); |
21027 | OutChains[3] = |
21028 | DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), |
21029 | /* Alignment = */ 1); |
21030 | |
21031 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); |
21032 | } |
21033 | } |
21034 | |
21035 | SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, |
21036 | SelectionDAG &DAG) const { |
21037 | /* |
21038 |     The rounding mode is in bits 11:10 of the x87 FP control word (FPCW), and has the following
21039 | settings: |
21040 | 00 Round to nearest |
21041 | 01 Round to -inf |
21042 | 10 Round to +inf |
21043 | 11 Round to 0 |
21044 | |
21045 | FLT_ROUNDS, on the other hand, expects the following: |
21046 | -1 Undefined |
21047 | 0 Round to 0 |
21048 | 1 Round to nearest |
21049 | 2 Round to +inf |
21050 | 3 Round to -inf |
21051 | |
21052 | To perform the conversion, we do: |
21053 |     (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
21054 | */ |
21055 | |
21056 | MachineFunction &MF = DAG.getMachineFunction(); |
21057 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); |
21058 | unsigned StackAlignment = TFI.getStackAlignment(); |
21059 | MVT VT = Op.getSimpleValueType(); |
21060 | SDLoc DL(Op); |
21061 | |
21062 | // Save FP Control Word to stack slot |
21063 | int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); |
21064 | SDValue StackSlot = |
21065 | DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); |
21066 | |
21067 | MachineMemOperand *MMO = |
21068 | MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), |
21069 | MachineMemOperand::MOStore, 2, 2); |
21070 | |
21071 | SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; |
21072 | SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, |
21073 | DAG.getVTList(MVT::Other), |
21074 | Ops, MVT::i16, MMO); |
21075 | |
21076 | // Load FP Control Word from stack slot |
21077 | SDValue CWD = |
21078 | DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); |
21079 | |
21080 | // Transform as necessary |
21081 | SDValue CWD1 = |
21082 | DAG.getNode(ISD::SRL, DL, MVT::i16, |
21083 | DAG.getNode(ISD::AND, DL, MVT::i16, |
21084 | CWD, DAG.getConstant(0x800, DL, MVT::i16)), |
21085 | DAG.getConstant(11, DL, MVT::i8)); |
21086 | SDValue CWD2 = |
21087 | DAG.getNode(ISD::SRL, DL, MVT::i16, |
21088 | DAG.getNode(ISD::AND, DL, MVT::i16, |
21089 | CWD, DAG.getConstant(0x400, DL, MVT::i16)), |
21090 | DAG.getConstant(9, DL, MVT::i8)); |
21091 | |
21092 | SDValue RetVal = |
21093 | DAG.getNode(ISD::AND, DL, MVT::i16, |
21094 | DAG.getNode(ISD::ADD, DL, MVT::i16, |
21095 | DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), |
21096 | DAG.getConstant(1, DL, MVT::i16)), |
21097 | DAG.getConstant(3, DL, MVT::i16)); |
21098 | |
21099 | return DAG.getNode((VT.getSizeInBits() < 16 ? |
21100 | ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); |
21101 | } |
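      | // Worked example: with the default x87 control word 0x037F the RC field
      | // (bits 11:10) is 00, so the expression above evaluates to
      | //   ((0 | 0) + 1) & 3 = 1  (round to nearest),
      | // while RC = 11 (round toward zero) gives ((1 | 2) + 1) & 3 = 0, matching
      | // the FLT_ROUNDS encoding.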
21102 | |
21103 | // Split a unary integer op into two half-sized ops.
21104 | static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { |
21105 | MVT VT = Op.getSimpleValueType(); |
21106 | unsigned NumElems = VT.getVectorNumElements(); |
21107 | unsigned SizeInBits = VT.getSizeInBits(); |
21108 | |
21109 | // Extract the Lo/Hi vectors |
21110 | SDLoc dl(Op); |
21111 | SDValue Src = Op.getOperand(0); |
21112 | SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); |
21113 | SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); |
21114 | |
21115 | MVT EltVT = VT.getVectorElementType(); |
21116 | MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); |
21117 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, |
21118 | DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), |
21119 | DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); |
21120 | } |
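      | // For example, a v32i8 unary op on a target where v32i8 is not legal is
      | // lowered as
      | //   concat_vectors(op(extract_lo(x)), op(extract_hi(x)))
      | // with each half handled as an independent v16i8 operation.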
21121 | |
21122 | // Decompose 256-bit ops into smaller 128-bit ops. |
21123 | static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { |
21124 |   assert(Op.getSimpleValueType().is256BitVector() &&
21125 |          Op.getSimpleValueType().isInteger() &&
21126 |          "Only handle AVX 256-bit vector integer operation");
21127 | return LowerVectorIntUnary(Op, DAG); |
21128 | } |
21129 | |
21130 | // Decompose 512-bit ops into smaller 256-bit ops. |
21131 | static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { |
21132 |   assert(Op.getSimpleValueType().is512BitVector() &&
21133 |          Op.getSimpleValueType().isInteger() &&
21134 |          "Only handle AVX 512-bit vector integer operation");
21135 | return LowerVectorIntUnary(Op, DAG); |
21136 | } |
21137 | |
21138 | /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
21139 | //
21140 | // i8/i16 vectors are implemented using the dword LZCNT vector instruction
21141 | // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21142 | // split the vector, perform the operation on its Lo and Hi parts, and
21143 | // concatenate the results.
21144 | static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) { |
21145 |   assert(Op.getOpcode() == ISD::CTLZ);
21146 | SDLoc dl(Op); |
21147 | MVT VT = Op.getSimpleValueType(); |
21148 | MVT EltVT = VT.getVectorElementType(); |
21149 | unsigned NumElems = VT.getVectorNumElements(); |
21150 | |
21151 |   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21152 |          "Unsupported element type");
21153 | |
21154 |   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21155 | if (16 < NumElems) |
21156 | return LowerVectorIntUnary(Op, DAG); |
21157 | |
21158 | MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); |
21159 |   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21160 |          "Unsupported value type for operation");
21161 | |
21162 | // Use native supported vector instruction vplzcntd. |
21163 | Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); |
21164 | SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); |
21165 | SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); |
21166 | SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); |
21167 | |
21168 | return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); |
21169 | } |
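      | // Worked example for an i8 element: zext32 prepends 24 zero bits, so
      | //   lzcnt32(zext32(x)) == ctlz8(x) + 24,
      | // and subtracting Delta = 32 - 8 = 24 above recovers ctlz8(x).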
21170 | |
21171 | // Lower CTLZ using a PSHUFB lookup table implementation. |
21172 | static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, |
21173 | const X86Subtarget &Subtarget, |
21174 | SelectionDAG &DAG) { |
21175 | MVT VT = Op.getSimpleValueType(); |
21176 | int NumElts = VT.getVectorNumElements(); |
21177 | int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); |
21178 | MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); |
21179 | |
21180 | // Per-nibble leading zero PSHUFB lookup table. |
21181 | const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, |
21182 | /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, |
21183 | /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, |
21184 | /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; |
21185 | |
21186 | SmallVector<SDValue, 64> LUTVec; |
21187 | for (int i = 0; i < NumBytes; ++i) |
21188 | LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); |
21189 | SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); |
21190 | |
21191 |   // Begin by bitcasting the input to a byte vector, then split those bytes
21192 |   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21193 | // If the hi input nibble is zero then we add both results together, otherwise |
21194 | // we just take the hi result (by masking the lo result to zero before the |
21195 | // add). |
21196 | SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); |
21197 | SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); |
21198 | |
21199 | SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); |
21200 | SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); |
21201 | SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); |
21202 | SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); |
21203 | SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); |
21204 | |
21205 | Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); |
21206 | Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); |
21207 | Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); |
21208 | SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); |
21209 | |
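      |   // Worked example on the byte 0x05: Hi = 0 and Lo = 5, so HiZ is all-ones,
      |   // Lo becomes LUT[5] = 1 and Hi becomes LUT[0] = 4, giving 4 + 1 = 5 --
      |   // the leading-zero count of 0b00000101. For 0x1A (Hi = 1, nonzero) the
      |   // lo result is masked to zero and the answer is just LUT[1] = 3.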
21210 | // Merge result back from vXi8 back to VT, working on the lo/hi halves |
21211 | // of the current vector width in the same way we did for the nibbles. |
21212 | // If the upper half of the input element is zero then add the halves' |
21213 | // leading zero counts together, otherwise just use the upper half's. |
21214 | // Double the width of the result until we are at target width. |
21215 | while (CurrVT != VT) { |
21216 | int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); |
21217 | int CurrNumElts = CurrVT.getVectorNumElements(); |
21218 | MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); |
21219 | MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); |
21220 | SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); |
21221 | |
21222 | // Check if the upper half of the input element is zero. |
21223 | SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), |
21224 | DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); |
21225 | HiZ = DAG.getBitcast(NextVT, HiZ); |
21226 | |
21227 | // Move the upper/lower halves to the lower bits as we'll be extending to |
21228 | // NextVT. Mask the lower result to zero if HiZ is true and add the results |
21229 | // together. |
21230 | SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); |
21231 | SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); |
21232 | SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); |
21233 | R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); |
21234 | Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); |
21235 | CurrVT = NextVT; |
21236 | } |
21237 | |
21238 | return Res; |
21239 | } |
21240 | |
21241 | static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, |
21242 | const X86Subtarget &Subtarget, |
21243 | SelectionDAG &DAG) { |
21244 | MVT VT = Op.getSimpleValueType(); |
21245 | |
21246 | if (Subtarget.hasCDI()) |
21247 | return LowerVectorCTLZ_AVX512CDI(Op, DAG); |
21248 | |
21249 | // Decompose 256-bit ops into smaller 128-bit ops. |
21250 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
21251 | return Lower256IntUnary(Op, DAG); |
21252 | |
21253 | // Decompose 512-bit ops into smaller 256-bit ops. |
21254 | if (VT.is512BitVector() && !Subtarget.hasBWI()) |
21255 | return Lower512IntUnary(Op, DAG); |
21256 | |
21257 |   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21258 | return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); |
21259 | } |
21260 | |
21261 | static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, |
21262 | SelectionDAG &DAG) { |
21263 | MVT VT = Op.getSimpleValueType(); |
21264 | MVT OpVT = VT; |
21265 | unsigned NumBits = VT.getSizeInBits(); |
21266 | SDLoc dl(Op); |
21267 | unsigned Opc = Op.getOpcode(); |
21268 | |
21269 | if (VT.isVector()) |
21270 | return LowerVectorCTLZ(Op, dl, Subtarget, DAG); |
21271 | |
21272 | Op = Op.getOperand(0); |
21273 | if (VT == MVT::i8) { |
21274 |     // Zero extend to i32 since there is no i8 bsr.
21275 | OpVT = MVT::i32; |
21276 | Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); |
21277 | } |
21278 | |
21279 | // Issue a bsr (scan bits in reverse) which also sets EFLAGS. |
21280 | SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); |
21281 | Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); |
21282 | |
21283 | if (Opc == ISD::CTLZ) { |
21284 | // If src is zero (i.e. bsr sets ZF), returns NumBits. |
21285 | SDValue Ops[] = { |
21286 | Op, |
21287 | DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), |
21288 | DAG.getConstant(X86::COND_E, dl, MVT::i8), |
21289 | Op.getValue(1) |
21290 | }; |
21291 | Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); |
21292 | } |
21293 | |
21294 | // Finally xor with NumBits-1. |
21295 | Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, |
21296 | DAG.getConstant(NumBits - 1, dl, OpVT)); |
21297 | |
21298 | if (VT == MVT::i8) |
21299 | Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); |
21300 | return Op; |
21301 | } |
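      | // Worked example: for x = 0x10, bsr returns the index of the highest set
      | // bit (4), and 4 ^ 31 == 31 - 4 == 27 == ctlz32(0x10). When the source is
      | // zero, the CMOV above substitutes 2*NumBits - 1, which the same final
      | // xor with NumBits - 1 turns into NumBits.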
21302 | |
21303 | static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { |
21304 | MVT VT = Op.getSimpleValueType(); |
21305 | unsigned NumBits = VT.getScalarSizeInBits(); |
21306 | SDLoc dl(Op); |
21307 | |
21308 | if (VT.isVector()) { |
21309 | SDValue N0 = Op.getOperand(0); |
21310 | SDValue Zero = DAG.getConstant(0, dl, VT); |
21311 | |
21312 | // lsb(x) = (x & -x) |
21313 | SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, |
21314 | DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); |
21315 | |
21316 | // cttz_undef(x) = (width - 1) - ctlz(lsb) |
21317 | if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { |
21318 | SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); |
21319 | return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, |
21320 | DAG.getNode(ISD::CTLZ, dl, VT, LSB)); |
21321 | } |
21322 | |
21323 | // cttz(x) = ctpop(lsb - 1) |
21324 | SDValue One = DAG.getConstant(1, dl, VT); |
21325 | return DAG.getNode(ISD::CTPOP, dl, VT, |
21326 | DAG.getNode(ISD::SUB, dl, VT, LSB, One)); |
21327 | } |
21328 | |
21329 |   assert(Op.getOpcode() == ISD::CTTZ &&
21330 |          "Only scalar CTTZ requires custom lowering");
21331 | |
21332 | // Issue a bsf (scan bits forward) which also sets EFLAGS. |
21333 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); |
21334 | Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); |
21335 | |
21336 | // If src is zero (i.e. bsf sets ZF), returns NumBits. |
21337 | SDValue Ops[] = { |
21338 | Op, |
21339 | DAG.getConstant(NumBits, dl, VT), |
21340 | DAG.getConstant(X86::COND_E, dl, MVT::i8), |
21341 | Op.getValue(1) |
21342 | }; |
21343 | return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); |
21344 | } |
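      | // Worked example: for x = 12 (0b1100), x & -x isolates the lowest set bit,
      | // giving 4; then cttz(12) = ctpop(4 - 1) = ctpop(0b011) = 2, and the
      | // undef-on-zero form computes (width - 1) - ctlz(4) = 31 - 29 = 2 for a
      | // 32-bit element.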
21345 | |
21346 | /// Break a 256-bit integer operation into two new 128-bit ones and then |
21347 | /// concatenate the result back. |
21348 | static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { |
21349 | MVT VT = Op.getSimpleValueType(); |
21350 | |
21351 |   assert(VT.is256BitVector() && VT.isInteger() &&
21352 |          "Unsupported value type for operation");
21353 | |
21354 | unsigned NumElems = VT.getVectorNumElements(); |
21355 | SDLoc dl(Op); |
21356 | |
21357 | // Extract the LHS vectors |
21358 | SDValue LHS = Op.getOperand(0); |
21359 | SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); |
21360 | SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); |
21361 | |
21362 | // Extract the RHS vectors |
21363 | SDValue RHS = Op.getOperand(1); |
21364 | SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); |
21365 | SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); |
21366 | |
21367 | MVT EltVT = VT.getVectorElementType(); |
21368 | MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); |
21369 | |
21370 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, |
21371 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), |
21372 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); |
21373 | } |
21374 | |
21375 | /// Break a 512-bit integer operation into two new 256-bit ones and then |
21376 | /// concatenate the result back. |
21377 | static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { |
21378 | MVT VT = Op.getSimpleValueType(); |
21379 | |
21380 |   assert(VT.is512BitVector() && VT.isInteger() &&
21381 |          "Unsupported value type for operation");
21382 | |
21383 | unsigned NumElems = VT.getVectorNumElements(); |
21384 | SDLoc dl(Op); |
21385 | |
21386 | // Extract the LHS vectors |
21387 | SDValue LHS = Op.getOperand(0); |
21388 | SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); |
21389 | SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); |
21390 | |
21391 | // Extract the RHS vectors |
21392 | SDValue RHS = Op.getOperand(1); |
21393 | SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); |
21394 | SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); |
21395 | |
21396 | MVT EltVT = VT.getVectorElementType(); |
21397 | MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); |
21398 | |
21399 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, |
21400 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), |
21401 | DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); |
21402 | } |
21403 | |
21404 | static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { |
21405 | MVT VT = Op.getSimpleValueType(); |
21406 | if (VT.getScalarType() == MVT::i1) |
21407 | return DAG.getNode(ISD::XOR, SDLoc(Op), VT, |
21408 | Op.getOperand(0), Op.getOperand(1)); |
21409 |   assert(Op.getSimpleValueType().is256BitVector() &&
21410 |          Op.getSimpleValueType().isInteger() &&
21411 |          "Only handle AVX 256-bit vector integer operation");
21412 | return Lower256IntArith(Op, DAG); |
21413 | } |
21414 | |
21415 | static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { |
21416 |   assert(Op.getSimpleValueType().is256BitVector() &&
21417 |          Op.getSimpleValueType().isInteger() &&
21418 |          "Only handle AVX 256-bit vector integer operation");
21419 | return Lower256IntUnary(Op, DAG); |
21420 | } |
21421 | |
21422 | static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { |
21423 |   assert(Op.getSimpleValueType().is256BitVector() &&
21424 |          Op.getSimpleValueType().isInteger() &&
21425 |          "Only handle AVX 256-bit vector integer operation");
21426 | return Lower256IntArith(Op, DAG); |
21427 | } |
21428 | |
21429 | static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, |
21430 | SelectionDAG &DAG) { |
21431 | SDLoc dl(Op); |
21432 | MVT VT = Op.getSimpleValueType(); |
21433 | |
21434 | if (VT.getScalarType() == MVT::i1) |
21435 | return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); |
21436 | |
21437 | // Decompose 256-bit ops into smaller 128-bit ops. |
21438 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
21439 | return Lower256IntArith(Op, DAG); |
21440 | |
21441 | SDValue A = Op.getOperand(0); |
21442 | SDValue B = Op.getOperand(1); |
21443 | |
21444 | // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 |
21445 | // vector pairs, multiply and truncate. |
21446 | if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { |
21447 | if (Subtarget.hasInt256()) { |
21448 | // For 512-bit vectors, split into 256-bit vectors to allow the |
21449 | // sign-extension to occur. |
21450 | if (VT == MVT::v64i8) |
21451 | return Lower512IntArith(Op, DAG); |
21452 | |
21453 | // For 256-bit vectors, split into 128-bit vectors to allow the |
21454 | // sign-extension to occur. We don't need this on AVX512BW as we can |
21455 | // safely sign-extend to v32i16. |
21456 | if (VT == MVT::v32i8 && !Subtarget.hasBWI()) |
21457 | return Lower256IntArith(Op, DAG); |
21458 | |
21459 | MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); |
21460 | return DAG.getNode( |
21461 | ISD::TRUNCATE, dl, VT, |
21462 | DAG.getNode(ISD::MUL, dl, ExVT, |
21463 | DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), |
21464 | DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); |
21465 | } |
21466 | |
21467 |     assert(VT == MVT::v16i8 &&
21468 |            "Pre-AVX2 support only supports v16i8 multiplication");
21469 | MVT ExVT = MVT::v8i16; |
21470 | |
21471 | // Extract the lo parts and sign extend to i16 |
21472 | SDValue ALo, BLo; |
21473 | if (Subtarget.hasSSE41()) { |
21474 | ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); |
21475 | BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); |
21476 | } else { |
21477 | const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, |
21478 | -1, 4, -1, 5, -1, 6, -1, 7}; |
21479 | ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21480 | BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21481 | ALo = DAG.getBitcast(ExVT, ALo); |
21482 | BLo = DAG.getBitcast(ExVT, BLo); |
21483 | ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); |
21484 | BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); |
21485 | } |
21486 | |
21487 | // Extract the hi parts and sign extend to i16 |
21488 | SDValue AHi, BHi; |
21489 | if (Subtarget.hasSSE41()) { |
21490 | const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, |
21491 | -1, -1, -1, -1, -1, -1, -1, -1}; |
21492 | AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21493 | BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21494 | AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); |
21495 | BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); |
21496 | } else { |
21497 | const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, |
21498 | -1, 12, -1, 13, -1, 14, -1, 15}; |
21499 | AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21500 | BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21501 | AHi = DAG.getBitcast(ExVT, AHi); |
21502 | BHi = DAG.getBitcast(ExVT, BHi); |
21503 | AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); |
21504 | BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); |
21505 | } |
21506 | |
21507 |     // Multiply, mask the lower 8 bits of the lo/hi results and pack
21508 | SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); |
21509 | SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); |
21510 | RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); |
21511 | RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); |
21512 | return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); |
21513 | } |
21514 | |
21515 | // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. |
21516 | if (VT == MVT::v4i32) { |
21517 |     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21518 |            "Should not custom lower when pmuldq is available!");
21519 | |
21520 | // Extract the odd parts. |
21521 | static const int UnpackMask[] = { 1, -1, 3, -1 }; |
21522 | SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); |
21523 | SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); |
21524 | |
21525 | // Multiply the even parts. |
21526 | SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); |
21527 | // Now multiply odd parts. |
21528 | SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); |
21529 | |
21530 | Evens = DAG.getBitcast(VT, Evens); |
21531 | Odds = DAG.getBitcast(VT, Odds); |
21532 | |
21533 | // Merge the two vectors back together with a shuffle. This expands into 2 |
21534 | // shuffles. |
21535 | static const int ShufMask[] = { 0, 4, 2, 6 }; |
21536 | return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); |
21537 | } |
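      |   // To illustrate the shuffles above on a = <a0,a1,a2,a3> and
      |   // b = <b0,b1,b2,b3>: the first PMULUDQ multiplies the even lanes,
      |   // yielding the v2i64 <a0*b0, a2*b2>; the unpack mask moves the odd lanes
      |   // into even positions so the second PMULUDQ yields <a1*b1, a3*b3>; and
      |   // the final <0,4,2,6> shuffle interleaves the low 32 bits of each 64-bit
      |   // product back into <a0*b0, a1*b1, a2*b2, a3*b3>.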
21538 | |
21539 |   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21540 |          "Only know how to lower V2I64/V4I64/V8I64 multiply");
21541 | |
21542 |   // 32-bit vector types used for PMULDQ/PMULUDQ.
21543 | MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); |
21544 | |
21545 |   // PMULDQ returns the 64-bit result of the signed multiplication of the
21546 |   // lower 32 bits. We can lower with this if the sign bits stretch that far.
21547 | if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 && |
21548 | DAG.ComputeNumSignBits(B) > 32) { |
21549 | return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A), |
21550 | DAG.getBitcast(MulVT, B)); |
21551 | } |
21552 | |
21553 | // Ahi = psrlqi(a, 32); |
21554 | // Bhi = psrlqi(b, 32); |
21555 | // |
21556 | // AloBlo = pmuludq(a, b); |
21557 | // AloBhi = pmuludq(a, Bhi); |
21558 | // AhiBlo = pmuludq(Ahi, b); |
21559 | // |
21560 | // Hi = psllqi(AloBhi + AhiBlo, 32); |
21561 | // return AloBlo + Hi; |
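      |   // This decomposition follows from writing a = 2^32*aH + aL and
      |   // b = 2^32*bH + bL, so that
      |   //   a*b mod 2^64 = aL*bL + 2^32*(aL*bH + aH*bL)
      |   // (the aH*bH term carries entirely out of the low 64 bits), with each
      |   // partial product computed by the unsigned 32x32->64 bit PMULUDQ.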
21562 | APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); |
21563 | bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); |
21564 | bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask); |
21565 | |
21566 | APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); |
21567 | bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask); |
21568 | bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask); |
21569 | |
21570 | // Bit cast to 32-bit vectors for MULUDQ. |
21571 | SDValue Alo = DAG.getBitcast(MulVT, A); |
21572 | SDValue Blo = DAG.getBitcast(MulVT, B); |
21573 | |
21574 | SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); |
21575 | |
21576 | // Only multiply lo/hi halves that aren't known to be zero. |
21577 | SDValue AloBlo = Zero; |
21578 | if (!ALoIsZero && !BLoIsZero) |
21579 | AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); |
21580 | |
21581 | SDValue AloBhi = Zero; |
21582 | if (!ALoIsZero && !BHiIsZero) { |
21583 | SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); |
21584 | Bhi = DAG.getBitcast(MulVT, Bhi); |
21585 | AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); |
21586 | } |
21587 | |
21588 | SDValue AhiBlo = Zero; |
21589 | if (!AHiIsZero && !BLoIsZero) { |
21590 | SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); |
21591 | Ahi = DAG.getBitcast(MulVT, Ahi); |
21592 | AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); |
21593 | } |
21594 | |
21595 | SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); |
21596 | Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); |
21597 | |
21598 | return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); |
21599 | } |
21600 | |
21601 | static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, |
21602 | SelectionDAG &DAG) { |
21603 | SDLoc dl(Op); |
21604 | MVT VT = Op.getSimpleValueType(); |
21605 | |
21606 | // Decompose 256-bit ops into smaller 128-bit ops. |
21607 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
21608 | return Lower256IntArith(Op, DAG); |
21609 | |
21610 | // Only i8 vectors should need custom lowering after this. |
21611 |   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21612 |          "Unsupported vector type");
21613 | |
21614 |   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21615 |   // logically shift the upper half down, and pack back to i8.
21616 | SDValue A = Op.getOperand(0); |
21617 | SDValue B = Op.getOperand(1); |
21618 | |
21619 | // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack |
21620 | // and then ashr/lshr the upper bits down to the lower bits before multiply. |
21621 | unsigned Opcode = Op.getOpcode(); |
21622 | unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA); |
21623 | unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); |
21624 | |
21625 | // AVX2 implementations - extend xmm subvectors to ymm. |
21626 | if (Subtarget.hasInt256()) { |
21627 | SDValue Lo = DAG.getIntPtrConstant(0, dl); |
21628 | SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); |
21629 | |
21630 | if (VT == MVT::v32i8) { |
21631 | SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo); |
21632 | SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo); |
21633 | SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi); |
21634 | SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi); |
21635 | ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo); |
21636 | BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo); |
21637 | AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi); |
21638 | BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi); |
21639 | Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, |
21640 | DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo), |
21641 | DAG.getConstant(8, dl, MVT::v16i16)); |
21642 | Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16, |
21643 | DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi), |
21644 | DAG.getConstant(8, dl, MVT::v16i16)); |
21645 | // The ymm variant of PACKUS treats the 128-bit lanes separately, so before |
21646 | // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane. |
21647 | const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7, |
21648 | 16, 17, 18, 19, 20, 21, 22, 23}; |
21649 | const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15, |
21650 | 24, 25, 26, 27, 28, 29, 30, 31}; |
21651 | return DAG.getNode(X86ISD::PACKUS, dl, VT, |
21652 | DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask), |
21653 | DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); |
21654 | } |
21655 | |
21656 | SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); |
21657 | SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); |
21658 | SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); |
21659 | SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, |
21660 | DAG.getConstant(8, dl, MVT::v16i16)); |
21661 | Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo); |
21662 | Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi); |
21663 | return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); |
21664 | } |
21665 | |
21666 |   assert(VT == MVT::v16i8 &&
21667 |          "Pre-AVX2 support only supports v16i8 multiplication");
21668 | MVT ExVT = MVT::v8i16; |
21669 | |
21670 | // Extract the lo parts and zero/sign extend to i16. |
21671 | SDValue ALo, BLo; |
21672 | if (Subtarget.hasSSE41()) { |
21673 | ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); |
21674 | BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); |
21675 | } else { |
21676 | const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, |
21677 | -1, 4, -1, 5, -1, 6, -1, 7}; |
21678 | ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21679 | BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21680 | ALo = DAG.getBitcast(ExVT, ALo); |
21681 | BLo = DAG.getBitcast(ExVT, BLo); |
21682 | ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); |
21683 | BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); |
21684 | } |
21685 | |
21686 | // Extract the hi parts and zero/sign extend to i16. |
21687 | SDValue AHi, BHi; |
21688 | if (Subtarget.hasSSE41()) { |
21689 | const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, |
21690 | -1, -1, -1, -1, -1, -1, -1, -1}; |
21691 | AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21692 | BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21693 | AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); |
21694 | BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); |
21695 | } else { |
21696 | const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, |
21697 | -1, 12, -1, 13, -1, 14, -1, 15}; |
21698 | AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); |
21699 | BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); |
21700 | AHi = DAG.getBitcast(ExVT, AHi); |
21701 | BHi = DAG.getBitcast(ExVT, BHi); |
21702 | AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); |
21703 | BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); |
21704 | } |
21705 | |
21706 |   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
21707 |   // and pack back to v16i8.
21708 | SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); |
21709 | SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); |
21710 | RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT)); |
21711 | RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT)); |
21712 | return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); |
21713 | } |
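      | // Worked example for MULHU on i8 lanes: 200 * 3 = 600 = 0x0258 in the
      | // widened i16 lane, and the logical shift right by 8 leaves 0x02 -- the
      | // high byte of the product, which is what MULHU returns.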
21714 | |
21715 | SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { |
21716 |   assert(Subtarget.isTargetWin64() && "Unexpected target");
21717 | EVT VT = Op.getValueType(); |
21718 |   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21719 |          "Unexpected return type for lowering");
21720 | |
21721 | RTLIB::Libcall LC; |
21722 | bool isSigned; |
21723 | switch (Op->getOpcode()) { |
21724 | default: llvm_unreachable("Unexpected request for libcall!");
21725 | case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; |
21726 | case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; |
21727 | case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; |
21728 | case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; |
21729 | case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; |
21730 | case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; |
21731 | } |
21732 | |
21733 | SDLoc dl(Op); |
21734 | SDValue InChain = DAG.getEntryNode(); |
21735 | |
21736 | TargetLowering::ArgListTy Args; |
21737 | TargetLowering::ArgListEntry Entry; |
21738 | for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { |
21739 | EVT ArgVT = Op->getOperand(i).getValueType(); |
21740 | assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21741 | "Unexpected argument type for lowering");
21742 | SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); |
21743 | Entry.Node = StackPtr; |
21744 | InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, |
21745 | MachinePointerInfo(), /* Alignment = */ 16); |
21746 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); |
21747 | Entry.Ty = PointerType::get(ArgTy, 0);
21748 | Entry.IsSExt = false; |
21749 | Entry.IsZExt = false; |
21750 | Args.push_back(Entry); |
21751 | } |
21752 | |
21753 | SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), |
21754 | getPointerTy(DAG.getDataLayout())); |
21755 | |
21756 | TargetLowering::CallLoweringInfo CLI(DAG); |
21757 | CLI.setDebugLoc(dl) |
21758 | .setChain(InChain) |
21759 | .setLibCallee( |
21760 | getLibcallCallingConv(LC), |
21761 | static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, |
21762 | std::move(Args)) |
21763 | .setInRegister() |
21764 | .setSExtResult(isSigned) |
21765 | .setZExtResult(!isSigned); |
21766 | |
21767 | std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); |
21768 | return DAG.getBitcast(VT, CallInfo.first); |
21769 | } |
21770 | |
21771 | static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, |
21772 | SelectionDAG &DAG) { |
21773 | SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); |
21774 | MVT VT = Op0.getSimpleValueType(); |
21775 | SDLoc dl(Op); |
21776 | |
21777 | // Decompose 256-bit ops into smaller 128-bit ops. |
21778 | if (VT.is256BitVector() && !Subtarget.hasInt256()) { |
21779 | unsigned Opcode = Op.getOpcode(); |
21780 | unsigned NumElems = VT.getVectorNumElements(); |
21781 | MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2); |
21782 | SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl); |
21783 | SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl); |
21784 | SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl); |
21785 | SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl); |
21786 | SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1); |
21787 | SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1); |
21788 | SDValue Ops[] = { |
21789 | DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)), |
21790 | DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1)) |
21791 | }; |
21792 | return DAG.getMergeValues(Ops, dl); |
21793 | } |
21794 | |
21795 | assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21796 | (VT == MVT::v8i32 && Subtarget.hasInt256()));
21797 | |
21798 | // PMULxD operations multiply each even value (starting at 0) of LHS with |
21799 | // the related value of RHS and produce a widened result.
21800 | // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> |
21801 | // => <2 x i64> <ae|cg> |
21802 | // |
21803 | // In other words, to have all the results, we need to perform two PMULxD:
21804 | // 1. one with the even values. |
21805 | // 2. one with the odd values. |
21806 | // To achieve #2, we need to place the odd values at an even position.
21807 | // |
21808 | // Place the odd value at an even position (basically, shift all values 1 |
21809 | // step to the left): |
21810 | const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; |
21811 | // <a|b|c|d> => <b|undef|d|undef> |
21812 | SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, |
21813 | makeArrayRef(&Mask[0], VT.getVectorNumElements())); |
21814 | // <e|f|g|h> => <f|undef|h|undef> |
21815 | SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, |
21816 | makeArrayRef(&Mask[0], VT.getVectorNumElements())); |
21817 | |
21818 | // Emit two multiplies, one for the even elements and one for the odd
21819 | // elements (which were moved into even positions above).
21820 | MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; |
21821 | bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; |
21822 | unsigned Opcode = |
21823 | (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; |
21824 | // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> |
21825 | // => <2 x i64> <ae|cg> |
21826 | SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); |
21827 | // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> |
21828 | // => <2 x i64> <bf|dh> |
21829 | SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); |
21830 | |
21831 | // Shuffle it back into the right order. |
21832 | SDValue Highs, Lows; |
21833 | if (VT == MVT::v8i32) { |
21834 | const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; |
21835 | Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); |
21836 | const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; |
21837 | Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); |
21838 | } else { |
21839 | const int HighMask[] = {1, 5, 3, 7}; |
21840 | Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); |
21841 | const int LowMask[] = {0, 4, 2, 6}; |
21842 | Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); |
21843 | } |
21844 | |
21845 | // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21846 | // unsigned multiply.
21847 | if (IsSigned && !Subtarget.hasSSE41()) { |
21848 | SDValue ShAmt = DAG.getConstant( |
21849 | 31, dl, |
21850 | DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); |
21851 | SDValue T1 = DAG.getNode(ISD::AND, dl, VT, |
21852 | DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); |
21853 | SDValue T2 = DAG.getNode(ISD::AND, dl, VT, |
21854 | DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); |
21855 | |
21856 | SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); |
21857 | Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); |
21858 | } |
21859 | |
21860 | // The first result of MUL_LOHI is actually the low value, followed by the |
21861 | // high value. |
21862 | SDValue Ops[] = {Lows, Highs}; |
21863 | return DAG.getMergeValues(Ops, dl); |
21864 | } |
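      | // Illustrative sketch, not part of the original source (MulHU32 is a
      | // made-up helper; assumes the <cstdint> fixed-width types): the scalar form
      | // of the signed-high fixup applied above when PMULDQ is unavailable, for a
      | // single 32-bit lane:
      | //   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)  (mod 2^32)
      | constexpr uint32_t MulHU32(uint32_t A, uint32_t B) {
      |   return uint32_t((uint64_t(A) * uint64_t(B)) >> 32);
      | }
      | // mulhs(-7, -9) is the high half of 63, i.e. 0; both correction terms fire.
      | static_assert(MulHU32(uint32_t(-7), uint32_t(-9)) - uint32_t(-9) -
      |                       uint32_t(-7) == 0u,
      |               "unsigned high minus both corrections equals the signed high");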
21865 | |
21866 | // Return true if the required (according to Opcode) shift-imm form is natively |
21867 | // supported by the Subtarget.
21868 | static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, |
21869 | unsigned Opcode) { |
21870 | if (VT.getScalarSizeInBits() < 16) |
21871 | return false; |
21872 | |
21873 | if (VT.is512BitVector() && Subtarget.hasAVX512() && |
21874 | (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) |
21875 | return true; |
21876 | |
21877 | bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || |
21878 | (VT.is256BitVector() && Subtarget.hasInt256()); |
21879 | |
21880 | bool AShift = LShift && (Subtarget.hasAVX512() || |
21881 | (VT != MVT::v2i64 && VT != MVT::v4i64)); |
21882 | return (Opcode == ISD::SRA) ? AShift : LShift; |
21883 | } |
21884 | |
21885 | // The shift amount is a variable, but it is the same for all vector lanes. |
21886 | // These instructions are defined together with shift-immediate. |
21887 | static |
21888 | bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, |
21889 | unsigned Opcode) { |
21890 | return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); |
21891 | } |
21892 | |
21893 | // Return true if the required (according to Opcode) variable-shift form is |
21894 | // natively supported by the Subtarget.
21895 | static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, |
21896 | unsigned Opcode) { |
21897 | |
21898 | if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) |
21899 | return false; |
21900 | |
21901 | // vXi16 is supported only with AVX-512 BWI.
21902 | if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) |
21903 | return false; |
21904 | |
21905 | if (Subtarget.hasAVX512()) |
21906 | return true; |
21907 | |
21908 | bool LShift = VT.is128BitVector() || VT.is256BitVector(); |
21909 | bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; |
21910 | return (Opcode == ISD::SRA) ? AShift : LShift; |
21911 | } |
21912 | |
21913 | static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, |
21914 | const X86Subtarget &Subtarget) { |
21915 | MVT VT = Op.getSimpleValueType(); |
21916 | SDLoc dl(Op); |
21917 | SDValue R = Op.getOperand(0); |
21918 | SDValue Amt = Op.getOperand(1); |
21919 | |
21920 | unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : |
21921 | (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; |
21922 | |
21923 | auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { |
21924 | assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21925 | MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); |
21926 | SDValue Ex = DAG.getBitcast(ExVT, R); |
21927 | |
21928 | // ashr(R, 63) === cmp_slt(R, 0) |
21929 | if (ShiftAmt == 63 && Subtarget.hasSSE42()) { |
21930 | assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
21931 | "Unsupported PCMPGT op");
21932 | return DAG.getNode(X86ISD::PCMPGT, dl, VT, |
21933 | getZeroVector(VT, Subtarget, DAG, dl), R); |
21934 | } |
21935 | |
21936 | if (ShiftAmt >= 32) { |
21937 | // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. |
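      | // E.g. (illustrative): R = 0xFFFFFFFF00000000, ShiftAmt = 32: the upper
      | // i32 becomes all sign bits and the lower i32 becomes the old upper half,
      | // giving 0xFFFFFFFFFFFFFFFF == R >> 32 (arithmetic).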
21938 | SDValue Upper = |
21939 | getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); |
21940 | SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, |
21941 | ShiftAmt - 32, DAG); |
21942 | if (VT == MVT::v2i64) |
21943 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); |
21944 | if (VT == MVT::v4i64) |
21945 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, |
21946 | {9, 1, 11, 3, 13, 5, 15, 7}); |
21947 | } else { |
21948 | // SRA upper i32, SRL whole i64 and select lower i32.
21949 | SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, |
21950 | ShiftAmt, DAG); |
21951 | SDValue Lower = |
21952 | getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); |
21953 | Lower = DAG.getBitcast(ExVT, Lower); |
21954 | if (VT == MVT::v2i64) |
21955 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); |
21956 | if (VT == MVT::v4i64) |
21957 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, |
21958 | {8, 1, 10, 3, 12, 5, 14, 7}); |
21959 | } |
21960 | return DAG.getBitcast(VT, Ex); |
21961 | }; |
21962 | |
21963 | // Optimize shl/srl/sra with constant shift amount. |
21964 | if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { |
21965 | if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { |
21966 | uint64_t ShiftAmt = ShiftConst->getZExtValue(); |
21967 | |
21968 | if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) |
21969 | return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); |
21970 | |
21971 | // i64 SRA needs to be performed as partial shifts. |
21972 | if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) && |
21973 | Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP()) |
21974 | return ArithmeticShiftRight64(ShiftAmt); |
21975 | |
21976 | if (VT == MVT::v16i8 || |
21977 | (Subtarget.hasInt256() && VT == MVT::v32i8) || |
21978 | VT == MVT::v64i8) { |
21979 | unsigned NumElts = VT.getVectorNumElements(); |
21980 | MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); |
21981 | |
21982 | // Simple i8 add case |
21983 | if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) |
21984 | return DAG.getNode(ISD::ADD, dl, VT, R, R); |
21985 | |
21986 | // ashr(R, 7) === cmp_slt(R, 0) |
21987 | if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { |
21988 | SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); |
21989 | if (VT.is512BitVector()) { |
21990 | assert(VT == MVT::v64i8 && "Unexpected element type!");
21991 | SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R); |
21992 | return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); |
21993 | } |
21994 | return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); |
21995 | } |
21996 | |
21997 | // XOP can shift v16i8 directly instead of as shift v8i16 + mask. |
21998 | if (VT == MVT::v16i8 && Subtarget.hasXOP()) |
21999 | return SDValue(); |
22000 | |
22001 | if (Op.getOpcode() == ISD::SHL) { |
22002 | // Make a large shift. |
22003 | SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, |
22004 | R, ShiftAmt, DAG); |
22005 | SHL = DAG.getBitcast(VT, SHL); |
22006 | // Zero out the rightmost bits. |
22007 | return DAG.getNode(ISD::AND, dl, VT, SHL, |
22008 | DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); |
22009 | } |
22010 | if (Op.getOpcode() == ISD::SRL) { |
22011 | // Make a large shift. |
22012 | SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, |
22013 | R, ShiftAmt, DAG); |
22014 | SRL = DAG.getBitcast(VT, SRL); |
22015 | // Zero out the leftmost bits. |
22016 | return DAG.getNode(ISD::AND, dl, VT, SRL, |
22017 | DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); |
22018 | } |
22019 | if (Op.getOpcode() == ISD::SRA) { |
22020 | // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) |
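      | // Worked example (illustrative): R = 0xF0 (-16), Amt = 4: lshr gives 0x0F,
      | // Mask = 128 >> 4 = 0x08, xor gives 0x07, and sub gives 0xFF (-1), which
      | // matches -16 >> 4 (arithmetic).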
22021 | SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); |
22022 | |
22023 | SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); |
22024 | Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); |
22025 | Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); |
22026 | return Res; |
22027 | } |
22028 | llvm_unreachable("Unknown shift opcode.");
22029 | } |
22030 | } |
22031 | } |
22032 | |
22033 | // Special case in 32-bit mode, where i64 is expanded into high and low parts. |
22034 | // TODO: Replace constant extraction with getTargetConstantBitsFromNode. |
22035 | if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && |
22036 | (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || |
22037 | (Subtarget.hasAVX512() && VT == MVT::v8i64))) { |
22038 | |
22039 | // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22040 | unsigned SubVectorScale = 1; |
22041 | if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { |
22042 | SubVectorScale = |
22043 | Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); |
22044 | Amt = Amt.getOperand(0); |
22045 | } |
22046 | |
22047 | // Peek through any splat that was introduced for i64 shift vectorization. |
22048 | int SplatIndex = -1; |
22049 | if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) |
22050 | if (SVN->isSplat()) { |
22051 | SplatIndex = SVN->getSplatIndex(); |
22052 | Amt = Amt.getOperand(0); |
22053 | assert(SplatIndex < (int)VT.getVectorNumElements() &&
22054 | "Splat shuffle referencing second operand");
22055 | } |
22056 | |
22057 | if (Amt.getOpcode() != ISD::BITCAST || |
22058 | Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) |
22059 | return SDValue(); |
22060 | |
22061 | Amt = Amt.getOperand(0); |
22062 | unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / |
22063 | (SubVectorScale * VT.getVectorNumElements()); |
22064 | unsigned RatioInLog2 = Log2_32_Ceil(Ratio); |
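      | // E.g. (illustrative): a v2i64 shift whose amount was built as a v4i32
      | // build_vector has Ratio == 2, so each 64-bit amount is reassembled from
      | // two i32 constants placed 32 bits apart (1 << (6 - RatioInLog2) == 32).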
22065 | uint64_t ShiftAmt = 0; |
22066 | unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); |
22067 | for (unsigned i = 0; i != Ratio; ++i) { |
22068 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); |
22069 | if (!C) |
22070 | return SDValue(); |
22071 | // 6 == Log2(64) |
22072 | ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); |
22073 | } |
22074 | |
22075 | // Check remaining shift amounts (if not a splat). |
22076 | if (SplatIndex < 0) { |
22077 | for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { |
22078 | uint64_t ShAmt = 0; |
22079 | for (unsigned j = 0; j != Ratio; ++j) { |
22080 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); |
22081 | if (!C) |
22082 | return SDValue(); |
22083 | // 6 == Log2(64) |
22084 | ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); |
22085 | } |
22086 | if (ShAmt != ShiftAmt) |
22087 | return SDValue(); |
22088 | } |
22089 | } |
22090 | |
22091 | if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) |
22092 | return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); |
22093 | |
22094 | if (Op.getOpcode() == ISD::SRA) |
22095 | return ArithmeticShiftRight64(ShiftAmt); |
22096 | } |
22097 | |
22098 | return SDValue(); |
22099 | } |
22100 | |
22101 | static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, |
22102 | const X86Subtarget &Subtarget) { |
22103 | MVT VT = Op.getSimpleValueType(); |
22104 | SDLoc dl(Op); |
22105 | SDValue R = Op.getOperand(0); |
22106 | SDValue Amt = Op.getOperand(1); |
22107 | |
22108 | unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : |
22109 | (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; |
22110 | |
22111 | unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : |
22112 | (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; |
22113 | |
22114 | if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { |
22115 | SDValue BaseShAmt; |
22116 | MVT EltVT = VT.getVectorElementType(); |
22117 | |
22118 | if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { |
22119 | // Check if this build_vector node is doing a splat. |
22120 | // If so, then set BaseShAmt equal to the splat value. |
22121 | BaseShAmt = BV->getSplatValue(); |
22122 | if (BaseShAmt && BaseShAmt.isUndef()) |
22123 | BaseShAmt = SDValue(); |
22124 | } else { |
22125 | if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) |
22126 | Amt = Amt.getOperand(0); |
22127 | |
22128 | ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); |
22129 | if (SVN && SVN->isSplat()) { |
22130 | unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); |
22131 | SDValue InVec = Amt.getOperand(0); |
22132 | if (InVec.getOpcode() == ISD::BUILD_VECTOR) { |
22133 | assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22134 | "Unexpected shuffle index found!");
22135 | BaseShAmt = InVec.getOperand(SplatIdx); |
22136 | } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { |
22137 | if (ConstantSDNode *C = |
22138 | dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { |
22139 | if (C->getZExtValue() == SplatIdx) |
22140 | BaseShAmt = InVec.getOperand(1); |
22141 | } |
22142 | } |
22143 | |
22144 | if (!BaseShAmt) |
22145 | // Avoid introducing an extract element from a shuffle. |
22146 | BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, |
22147 | DAG.getIntPtrConstant(SplatIdx, dl)); |
22148 | } |
22149 | } |
22150 | |
22151 | if (BaseShAmt.getNode()) { |
22152 | assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22153 | if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) |
22154 | BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); |
22155 | else if (EltVT.bitsLT(MVT::i32)) |
22156 | BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); |
22157 | |
22158 | return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); |
22159 | } |
22160 | } |
22161 | |
22162 | // Special case in 32-bit mode, where i64 is expanded into high and low parts. |
22163 | if (!Subtarget.is64Bit() && VT == MVT::v2i64 && |
22164 | Amt.getOpcode() == ISD::BITCAST && |
22165 | Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { |
22166 | Amt = Amt.getOperand(0); |
22167 | unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / |
22168 | VT.getVectorNumElements(); |
22169 | std::vector<SDValue> Vals(Ratio); |
22170 | for (unsigned i = 0; i != Ratio; ++i) |
22171 | Vals[i] = Amt.getOperand(i); |
22172 | for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { |
22173 | for (unsigned j = 0; j != Ratio; ++j) |
22174 | if (Vals[j] != Amt.getOperand(i + j)) |
22175 | return SDValue(); |
22176 | } |
22177 | |
22178 | if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) |
22179 | return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); |
22180 | } |
22181 | return SDValue(); |
22182 | } |
22183 | |
22184 | static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, |
22185 | SelectionDAG &DAG) { |
22186 | MVT VT = Op.getSimpleValueType(); |
22187 | SDLoc dl(Op); |
22188 | SDValue R = Op.getOperand(0); |
22189 | SDValue Amt = Op.getOperand(1); |
22190 | bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); |
22191 | |
22192 | assert(VT.isVector() && "Custom lowering only for vector shifts!");
22193 | assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22194 | |
22195 | if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) |
22196 | return V; |
22197 | |
22198 | if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) |
22199 | return V; |
22200 | |
22201 | if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) |
22202 | return Op; |
22203 | |
22204 | // XOP has 128-bit variable logical/arithmetic shifts. |
22205 | // +ve/-ve Amt = shift left/right. |
22206 | if (Subtarget.hasXOP() && |
22207 | (VT == MVT::v2i64 || VT == MVT::v4i32 || |
22208 | VT == MVT::v8i16 || VT == MVT::v16i8)) { |
22209 | if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { |
22210 | SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); |
22211 | Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); |
22212 | } |
22213 | if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) |
22214 | return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); |
22215 | if (Op.getOpcode() == ISD::SRA) |
22216 | return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); |
22217 | } |
22218 | |
22219 | // v2i64 vector logical shifts can efficiently avoid scalarization - do the
22220 | // shifts per-lane and then shuffle the partial results back together. |
22221 | if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { |
22222 | // Splat the shift amounts so the scalar shifts above will catch it. |
22223 | SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); |
22224 | SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); |
22225 | SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); |
22226 | SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); |
22227 | return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); |
22228 | } |
22229 | |
22230 | // i64 vector arithmetic shift can be emulated with the transform: |
22231 | // M = lshr(SIGN_MASK, Amt) |
22232 | // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) |
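      | // Worked example (illustrative): R = -2 (0xFF...FE), Amt = 1: lshr(R, 1)
      | // = 0x7F...FF, M = 0x40...00, xor gives 0x3F...FF, and subtracting M
      | // yields 0xFF...FF == -1, matching -2 >> 1 (arithmetic).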
22233 | if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && |
22234 | Op.getOpcode() == ISD::SRA) { |
22235 | SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); |
22236 | SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); |
22237 | R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); |
22238 | R = DAG.getNode(ISD::XOR, dl, VT, R, M); |
22239 | R = DAG.getNode(ISD::SUB, dl, VT, R, M); |
22240 | return R; |
22241 | } |
22242 | |
22243 | // If possible, lower this packed shift into a vector multiply instead of |
22244 | // expanding it into a sequence of scalar shifts. |
22245 | // Do this only if the vector shift count is a constant build_vector. |
22246 | if (ConstantAmt && Op.getOpcode() == ISD::SHL && |
22247 | (VT == MVT::v8i16 || VT == MVT::v4i32 || |
22248 | (Subtarget.hasInt256() && VT == MVT::v16i16))) { |
22249 | SmallVector<SDValue, 8> Elts; |
22250 | MVT SVT = VT.getVectorElementType(); |
22251 | unsigned SVTBits = SVT.getSizeInBits(); |
22252 | APInt One(SVTBits, 1); |
22253 | unsigned NumElems = VT.getVectorNumElements(); |
22254 | |
22255 | for (unsigned i = 0; i != NumElems; ++i) {
22256 | SDValue Op = Amt->getOperand(i); |
22257 | if (Op->isUndef()) { |
22258 | Elts.push_back(Op); |
22259 | continue; |
22260 | } |
22261 | |
22262 | ConstantSDNode *ND = cast<ConstantSDNode>(Op); |
22263 | APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); |
22264 | uint64_t ShAmt = C.getZExtValue(); |
22265 | if (ShAmt >= SVTBits) { |
22266 | Elts.push_back(DAG.getUNDEF(SVT)); |
22267 | continue; |
22268 | } |
22269 | Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); |
22270 | } |
22271 | SDValue BV = DAG.getBuildVector(VT, dl, Elts); |
22272 | return DAG.getNode(ISD::MUL, dl, VT, R, BV); |
22273 | } |
22274 | |
22275 | // Lower SHL with variable shift amount. |
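      | // The sequence below builds 2^Amt per lane by placing Amt in the exponent
      | // field of an IEEE-754 float. Worked example (illustrative): Amt = 3 gives
      | // (3 << 23) + 0x3f800000 == 0x41000000 == 8.0f, fp_to_sint yields 8, and
      | // the multiply computes R << 3.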
22276 | if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { |
22277 | Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); |
22278 | |
22279 | Op = DAG.getNode(ISD::ADD, dl, VT, Op, |
22280 | DAG.getConstant(0x3f800000U, dl, VT)); |
22281 | Op = DAG.getBitcast(MVT::v4f32, Op); |
22282 | Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); |
22283 | return DAG.getNode(ISD::MUL, dl, VT, Op, R); |
22284 | } |
22285 | |
22286 | // If possible, lower this shift as a sequence of two shifts by |
22287 | // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it. |
22288 | // Example: |
22289 | // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) |
22290 | // |
22291 | // Could be rewritten as: |
22292 | // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) |
22293 | // |
22294 | // The advantage is that the two shifts from the example would be |
22295 | // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing |
22296 | // the vector shift into four scalar shifts plus four pairs of vector |
22297 | // insert/extract. |
22298 | if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) { |
22299 | unsigned TargetOpcode = X86ISD::MOVSS; |
22300 | bool CanBeSimplified; |
22301 | // The splat value for the first packed shift (the 'X' from the example). |
22302 | SDValue Amt1 = Amt->getOperand(0); |
22303 | // The splat value for the second packed shift (the 'Y' from the example). |
22304 | SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); |
22305 | |
22306 | // See if it is possible to replace this node with a sequence of |
22307 | // two shifts followed by a MOVSS/MOVSD/PBLEND. |
22308 | if (VT == MVT::v4i32) { |
22309 | // Check if it is legal to use a MOVSS. |
22310 | CanBeSimplified = Amt2 == Amt->getOperand(2) && |
22311 | Amt2 == Amt->getOperand(3); |
22312 | if (!CanBeSimplified) { |
22313 | // Otherwise, check if we can still simplify this node using a MOVSD. |
22314 | CanBeSimplified = Amt1 == Amt->getOperand(1) && |
22315 | Amt->getOperand(2) == Amt->getOperand(3); |
22316 | TargetOpcode = X86ISD::MOVSD; |
22317 | Amt2 = Amt->getOperand(2); |
22318 | } |
22319 | } else { |
22320 | // Do similar checks for the case where the machine value type |
22321 | // is MVT::v8i16. |
22322 | CanBeSimplified = Amt1 == Amt->getOperand(1); |
22323 | for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
22324 | CanBeSimplified = Amt2 == Amt->getOperand(i); |
22325 | |
22326 | if (!CanBeSimplified) { |
22327 | TargetOpcode = X86ISD::MOVSD; |
22328 | CanBeSimplified = true; |
22329 | Amt2 = Amt->getOperand(4); |
22330 | for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
22331 | CanBeSimplified = Amt1 == Amt->getOperand(i); |
22332 | for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
22333 | CanBeSimplified = Amt2 == Amt->getOperand(j); |
22334 | } |
22335 | } |
22336 | |
22337 | if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && |
22338 | isa<ConstantSDNode>(Amt2)) { |
22339 | // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND. |
22340 | MVT CastVT = MVT::v4i32; |
22341 | SDValue Splat1 = |
22342 | DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); |
22343 | SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); |
22344 | SDValue Splat2 = |
22345 | DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); |
22346 | SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); |
22347 | SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); |
22348 | SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); |
22349 | if (TargetOpcode == X86ISD::MOVSD) |
22350 | return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, |
22351 | BitCast2, {0, 1, 6, 7})); |
22352 | return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, |
22353 | BitCast2, {0, 5, 6, 7})); |
22354 | } |
22355 | } |
22356 | |
22357 | // v4i32 Non Uniform Shifts. |
22358 | // If the shift amount is constant we can shift each lane using the SSE2 |
22359 | // immediate shifts, else we need to zero-extend each lane to the lower i64 |
22360 | // and shift using the SSE2 variable shifts. |
22361 | // The separate results can then be blended together. |
22362 | if (VT == MVT::v4i32) { |
22363 | unsigned Opc = Op.getOpcode(); |
22364 | SDValue Amt0, Amt1, Amt2, Amt3; |
22365 | if (ConstantAmt) { |
22366 | Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); |
22367 | Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); |
22368 | Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); |
22369 | Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); |
22370 | } else { |
22371 | // ISD::SHL is handled above but we include it here for completeness. |
22372 | switch (Opc) { |
22373 | default: |
22374 | llvm_unreachable("Unknown target vector shift node");
22375 | case ISD::SHL: |
22376 | Opc = X86ISD::VSHL; |
22377 | break; |
22378 | case ISD::SRL: |
22379 | Opc = X86ISD::VSRL; |
22380 | break; |
22381 | case ISD::SRA: |
22382 | Opc = X86ISD::VSRA; |
22383 | break; |
22384 | } |
22385 | // The SSE2 shifts use the lower i64 as the same shift amount for |
22386 | // all lanes and the upper i64 is ignored. These shuffle masks |
22387 | // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22388 | SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); |
22389 | Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); |
22390 | Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); |
22391 | Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); |
22392 | Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); |
22393 | } |
22394 | |
22395 | SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); |
22396 | SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); |
22397 | SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); |
22398 | SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); |
22399 | SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); |
22400 | SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); |
22401 | return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); |
22402 | } |
22403 | |
22404 | // It's worth extending once and using the vXi16/vXi32 shifts for smaller |
22405 | // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 |
22406 | // make the existing SSE solution better. |
22407 | if ((Subtarget.hasInt256() && VT == MVT::v8i16) || |
22408 | (Subtarget.hasAVX512() && VT == MVT::v16i16) || |
22409 | (Subtarget.hasAVX512() && VT == MVT::v16i8) || |
22410 | (Subtarget.hasBWI() && VT == MVT::v32i8)) { |
22411 | MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32); |
22412 | MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); |
22413 | unsigned ExtOpc = |
22414 | Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
22415 | R = DAG.getNode(ExtOpc, dl, ExtVT, R); |
22416 | Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); |
22417 | return DAG.getNode(ISD::TRUNCATE, dl, VT, |
22418 | DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); |
22419 | } |
22420 | |
22421 | if (VT == MVT::v16i8 || |
22422 | (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || |
22423 | (VT == MVT::v64i8 && Subtarget.hasBWI())) { |
22424 | MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); |
22425 | unsigned ShiftOpcode = Op->getOpcode(); |
22426 | |
22427 | auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { |
22428 | if (VT.is512BitVector()) { |
22429 | // On AVX512BW targets we make use of the fact that VSELECT lowers |
22430 | // to a masked blend which selects bytes based just on the sign bit |
22431 | // extracted to a mask. |
22432 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); |
22433 | V0 = DAG.getBitcast(VT, V0); |
22434 | V1 = DAG.getBitcast(VT, V1); |
22435 | Sel = DAG.getBitcast(VT, Sel); |
22436 | Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); |
22437 | return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); |
22438 | } else if (Subtarget.hasSSE41()) { |
22439 | // On SSE41 targets we make use of the fact that VSELECT lowers |
22440 | // to PBLENDVB which selects bytes based just on the sign bit. |
22441 | V0 = DAG.getBitcast(VT, V0); |
22442 | V1 = DAG.getBitcast(VT, V1); |
22443 | Sel = DAG.getBitcast(VT, Sel); |
22444 | return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); |
22445 | } |
22446 | // On pre-SSE41 targets we test for the sign bit by comparing to |
22447 | // zero - a negative value will set all bits of the lanes to true |
22448 | // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. |
22449 | SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); |
22450 | SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); |
22451 | return DAG.getSelect(dl, SelVT, C, V0, V1); |
22452 | }; |
22453 | |
22454 | // Turn 'a' into a mask suitable for VSELECT: a = a << 5; |
22455 | // We can safely do this using i16 shifts as we're only interested in |
22456 | // the 3 lower bits of each byte. |
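      | // Worked example (illustrative): Amt = 5 (0b101). After << 5 the sign bit
      | // holds bit 2 (take the shift-by-4 step); after the first a += a it holds
      | // bit 1 (skip shift-by-2); after the second it holds bit 0 (take
      | // shift-by-1), for a total shift of 4 + 1 == 5.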
22457 | Amt = DAG.getBitcast(ExtVT, Amt); |
22458 | Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); |
22459 | Amt = DAG.getBitcast(VT, Amt); |
22460 | |
22461 | if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { |
22462 | // r = VSELECT(r, shift(r, 4), a); |
22463 | SDValue M = |
22464 | DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); |
22465 | R = SignBitSelect(VT, Amt, M, R); |
22466 | |
22467 | // a += a |
22468 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); |
22469 | |
22470 | // r = VSELECT(r, shift(r, 2), a); |
22471 | M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); |
22472 | R = SignBitSelect(VT, Amt, M, R); |
22473 | |
22474 | // a += a |
22475 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); |
22476 | |
22477 | // return VSELECT(r, shift(r, 1), a); |
22478 | M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); |
22479 | R = SignBitSelect(VT, Amt, M, R); |
22480 | return R; |
22481 | } |
22482 | |
22483 | if (Op->getOpcode() == ISD::SRA) { |
22484 | // For SRA we need to unpack each byte to the higher byte of an i16 vector
22485 | // so we can correctly sign extend. We don't care what happens to the |
22486 | // lower byte. |
22487 | SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); |
22488 | SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); |
22489 | SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); |
22490 | SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); |
22491 | ALo = DAG.getBitcast(ExtVT, ALo); |
22492 | AHi = DAG.getBitcast(ExtVT, AHi); |
22493 | RLo = DAG.getBitcast(ExtVT, RLo); |
22494 | RHi = DAG.getBitcast(ExtVT, RHi); |
22495 | |
22496 | // r = VSELECT(r, shift(r, 4), a); |
22497 | SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, |
22498 | DAG.getConstant(4, dl, ExtVT)); |
22499 | SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, |
22500 | DAG.getConstant(4, dl, ExtVT)); |
22501 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); |
22502 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); |
22503 | |
22504 | // a += a |
22505 | ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); |
22506 | AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); |
22507 | |
22508 | // r = VSELECT(r, shift(r, 2), a); |
22509 | MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, |
22510 | DAG.getConstant(2, dl, ExtVT)); |
22511 | MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, |
22512 | DAG.getConstant(2, dl, ExtVT)); |
22513 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); |
22514 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); |
22515 | |
22516 | // a += a |
22517 | ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); |
22518 | AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); |
22519 | |
22520 | // r = VSELECT(r, shift(r, 1), a); |
22521 | MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, |
22522 | DAG.getConstant(1, dl, ExtVT)); |
22523 | MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, |
22524 | DAG.getConstant(1, dl, ExtVT)); |
22525 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); |
22526 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); |
22527 | |
22528 | // Logical shift the result back to the lower byte, leaving a zero upper
22529 | // byte, meaning that we can safely pack with PACKUSWB.
22531 | RLo = |
22532 | DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); |
22533 | RHi = |
22534 | DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); |
22535 | return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); |
22536 | } |
22537 | } |
22538 | |
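      | // Illustrative note, not in the original source: the block below handles
      | // v16i16 variable shifts on AVX2 by interleaving each i16 into the high
      | // half of an i32 lane (zero in the low half, so SRA still sees the right
      | // sign bit), doing a per-lane i32 shift, shifting the result back down by
      | // 16 and packing the two halves together with PACKUS.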
22539 | if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { |
22540 | MVT ExtVT = MVT::v8i32; |
22541 | SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); |
22542 | SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); |
22543 | SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); |
22544 | SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R); |
22545 | SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R); |
22546 | ALo = DAG.getBitcast(ExtVT, ALo); |
22547 | AHi = DAG.getBitcast(ExtVT, AHi); |
22548 | RLo = DAG.getBitcast(ExtVT, RLo); |
22549 | RHi = DAG.getBitcast(ExtVT, RHi); |
22550 | SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); |
22551 | SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); |
22552 | Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); |
22553 | Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); |
22554 | return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); |
22555 | } |
22556 | |
22557 | if (VT == MVT::v8i16) { |
22558 | unsigned ShiftOpcode = Op->getOpcode(); |
22559 | |
22560 | // If we have a constant shift amount, the non-SSE41 path is best as |
22561 | // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
22562 | bool UseSSE41 = Subtarget.hasSSE41() && |
22563 | !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); |
22564 | |
22565 | auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { |
22566 | // On SSE41 targets we make use of the fact that VSELECT lowers |
22567 | // to PBLENDVB which selects bytes based just on the sign bit. |
22568 | if (UseSSE41) { |
22569 | MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); |
22570 | V0 = DAG.getBitcast(ExtVT, V0); |
22571 | V1 = DAG.getBitcast(ExtVT, V1); |
22572 | Sel = DAG.getBitcast(ExtVT, Sel); |
22573 | return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); |
22574 | } |
22575 | // On pre-SSE41 targets we splat the sign bit - a negative value will |
22576 | // set all bits of the lanes to true and VSELECT uses that in |
22577 | // its OR(AND(V0,C),AND(V1,~C)) lowering. |
22578 | SDValue C = |
22579 | DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); |
22580 | return DAG.getSelect(dl, VT, C, V0, V1); |
22581 | }; |
22582 | |
22583 | // Turn 'a' into a mask suitable for VSELECT: a = a << 12; |
22584 | if (UseSSE41) { |
22585 | // On SSE41 targets we need to replicate the shift mask in both |
22586 | // bytes for PBLENDVB. |
22587 | Amt = DAG.getNode( |
22588 | ISD::OR, dl, VT, |
22589 | DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), |
22590 | DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); |
22591 | } else { |
22592 | Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); |
22593 | } |
22594 | |
22595 | // r = VSELECT(r, shift(r, 8), a); |
22596 | SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); |
22597 | R = SignBitSelect(Amt, M, R); |
22598 | |
22599 | // a += a |
22600 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); |
22601 | |
22602 | // r = VSELECT(r, shift(r, 4), a); |
22603 | M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); |
22604 | R = SignBitSelect(Amt, M, R); |
22605 | |
22606 | // a += a |
22607 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); |
22608 | |
22609 | // r = VSELECT(r, shift(r, 2), a); |
22610 | M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); |
22611 | R = SignBitSelect(Amt, M, R); |
22612 | |
22613 | // a += a |
22614 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); |
22615 | |
22616 | // return VSELECT(r, shift(r, 1), a); |
22617 | M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); |
22618 | R = SignBitSelect(Amt, M, R); |
22619 | return R; |
22620 | } |
22621 | |
22622 | // Decompose 256-bit shifts into smaller 128-bit shifts. |
22623 | if (VT.is256BitVector()) |
22624 | return Lower256IntArith(Op, DAG); |
22625 | |
22626 | return SDValue(); |
22627 | } |
22628 | |
22629 | static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, |
22630 | SelectionDAG &DAG) { |
22631 | MVT VT = Op.getSimpleValueType(); |
22632 | SDLoc DL(Op); |
22633 | SDValue R = Op.getOperand(0); |
22634 | SDValue Amt = Op.getOperand(1); |
22635 | |
22636 | assert(VT.isVector() && "Custom lowering only for vector rotates!");
22637 | assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22638 | assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22639 | |
22640 | // XOP has 128-bit vector variable + immediate rotates. |
22641 | // +ve/-ve Amt = rotate left/right. |
22642 | |
22643 | // Split 256-bit integers. |
22644 | if (VT.is256BitVector()) |
22645 | return Lower256IntArith(Op, DAG); |
22646 | |
22647 | assert(VT.is128BitVector() && "Only rotate 128-bit vectors!")((VT.is128BitVector() && "Only rotate 128-bit vectors!" ) ? static_cast<void> (0) : __assert_fail ("VT.is128BitVector() && \"Only rotate 128-bit vectors!\"" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 22647, __PRETTY_FUNCTION__)); |
22648 | |
22649 | // Attempt to rotate by immediate. |
22650 | if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { |
22651 | if (auto *RotateConst = BVAmt->getConstantSplatNode()) { |
22652 | uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); |
22653 | assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22654 | return DAG.getNode(X86ISD::VPROTI, DL, VT, R, |
22655 | DAG.getConstant(RotateAmt, DL, MVT::i8)); |
22656 | } |
22657 | } |
22658 | |
22659 | // Use general rotate by variable (per-element). |
22660 | return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); |
22661 | } |
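      | // Illustrative sketch, not part of the original source (RotL8 is a made-up
      | // helper; assumes the <cstdint> fixed-width types): the per-element
      | // semantics of the XOP rotate emitted above, shown for an i8 element.
      | constexpr uint8_t RotL8(uint8_t V, unsigned Amt) {
      |   return uint8_t((V << (Amt & 7)) | (V >> ((8 - Amt) & 7)));
      | }
      | static_assert(RotL8(0x81, 1) == 0x03, "0x81 rotated left by 1 is 0x03");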
22662 | |
22663 | static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { |
22664 | // Lower the "add/sub/mul with overflow" instruction into a regular instruction
22665 | // plus a "setcc" instruction that checks the overflow flag. The "brcond"
22666 | // lowering looks for this combo and may remove the "setcc" instruction if the
22667 | // "setcc" has only one use.
22668 | SDNode *N = Op.getNode(); |
22669 | SDValue LHS = N->getOperand(0); |
22670 | SDValue RHS = N->getOperand(1); |
22671 | unsigned BaseOp = 0; |
22672 | X86::CondCode Cond; |
22673 | SDLoc DL(Op); |
22674 | switch (Op.getOpcode()) { |
22675 | default: llvm_unreachable("Unknown ovf instruction!");
22676 | case ISD::SADDO: |
22677 | // An add of one will be selected as an INC. Note that INC doesn't
22678 | // set CF, so we can't do this for UADDO.
22679 | if (isOneConstant(RHS)) { |
22680 | BaseOp = X86ISD::INC; |
22681 | Cond = X86::COND_O; |
22682 | break; |
22683 | } |
22684 | BaseOp = X86ISD::ADD; |
22685 | Cond = X86::COND_O; |
22686 | break; |
22687 | case ISD::UADDO: |
22688 | BaseOp = X86ISD::ADD; |
22689 | Cond = X86::COND_B; |
22690 | break; |
22691 | case ISD::SSUBO: |
22692 | // A subtract of one will be selected as a DEC. Note that DEC doesn't |
22693 | // set CF, so we can't do this for USUBO. |
22694 | if (isOneConstant(RHS)) { |
22695 | BaseOp = X86ISD::DEC; |
22696 | Cond = X86::COND_O; |
22697 | break; |
22698 | } |
22699 | BaseOp = X86ISD::SUB; |
22700 | Cond = X86::COND_O; |
22701 | break; |
22702 | case ISD::USUBO: |
22703 | BaseOp = X86ISD::SUB; |
22704 | Cond = X86::COND_B; |
22705 | break; |
22706 | case ISD::SMULO: |
22707 | BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; |
22708 | Cond = X86::COND_O; |
22709 | break; |
22710 | case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs |
22711 | if (N->getValueType(0) == MVT::i8) { |
22712 | BaseOp = X86ISD::UMUL8; |
22713 | Cond = X86::COND_O; |
22714 | break; |
22715 | } |
22716 | SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), |
22717 | MVT::i32); |
22718 | SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); |
22719 | |
22720 | SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG); |
22721 | |
22722 | if (N->getValueType(1) == MVT::i1) |
22723 | SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); |
22724 | |
22725 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); |
22726 | } |
22727 | } |
22728 | |
22729 | // Also sets EFLAGS. |
22730 | SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); |
22731 | SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); |
22732 | |
22733 | SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG); |
22734 | |
22735 | if (N->getValueType(1) == MVT::i1) |
22736 | SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); |
22737 | |
22738 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); |
22739 | } |
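      | // Illustrative sketch, not part of the original source (UAddO32 is a
      | // made-up helper): the scalar meaning of the COND_B check used above for an
      | // i32 UADDO - unsigned overflow occurred iff the addition wrapped, which is
      | // exactly when the hardware sets the carry flag.
      | constexpr bool UAddO32(uint32_t A, uint32_t B) { return A + B < A; }
      | static_assert(UAddO32(0xFFFFFFFFu, 1u) && !UAddO32(7u, 8u),
      |               "carry out of bit 31 signals unsigned overflow");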
22740 | |
22741 | /// Returns true if the operand type is exactly twice the native width, and |
22742 | /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. |
22743 | /// Used to know whether to use cmpxchg8/16b when expanding atomic operations |
22744 | /// (otherwise we leave them alone to become __sync_fetch_and_... calls). |
22745 | bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { |
22746 | unsigned OpWidth = MemType->getPrimitiveSizeInBits(); |
22747 | |
22748 | if (OpWidth == 64) |
22749 | return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b |
22750 | else if (OpWidth == 128) |
22751 | return Subtarget.hasCmpxchg16b(); |
22752 | else |
22753 | return false; |
22754 | } |
22755 | |
22756 | bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { |
22757 | return needsCmpXchgNb(SI->getValueOperand()->getType()); |
22758 | } |
22759 | |
22760 | // Note: this turns large loads into lock cmpxchg8b/16b. |
22761 | // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
22762 | TargetLowering::AtomicExpansionKind |
22763 | X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { |
22764 | auto PTy = cast<PointerType>(LI->getPointerOperandType()); |
22765 | return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg |
22766 | : AtomicExpansionKind::None; |
22767 | } |
22768 | |
22769 | TargetLowering::AtomicExpansionKind |
22770 | X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { |
22771 | unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; |
22772 | Type *MemType = AI->getType(); |
22773 | |
22774 | // If the operand is too big, we must see if cmpxchg8/16b is available |
22775 | // and default to library calls otherwise. |
22776 | if (MemType->getPrimitiveSizeInBits() > NativeWidth) { |
22777 | return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg |
22778 | : AtomicExpansionKind::None; |
22779 | } |
22780 | |
22781 | AtomicRMWInst::BinOp Op = AI->getOperation(); |
22782 | switch (Op) { |
22783 | default: |
22784 | llvm_unreachable("Unknown atomic operation");
22785 | case AtomicRMWInst::Xchg: |
22786 | case AtomicRMWInst::Add: |
22787 | case AtomicRMWInst::Sub: |
22788 | // It's better to use xadd, xsub or xchg for these in all cases. |
22789 | return AtomicExpansionKind::None; |
22790 | case AtomicRMWInst::Or: |
22791 | case AtomicRMWInst::And: |
22792 | case AtomicRMWInst::Xor: |
22793 | // If the atomicrmw's result isn't actually used, we can just add a "lock" |
22794 | // prefix to a normal instruction for these operations. |
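      | // E.g. (illustrative): "atomicrmw or i32* %p, i32 1 seq_cst" whose result
      | // is unused can be selected as a single "lock orl" against the memory
      | // operand instead of a cmpxchg loop.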
22795 | return !AI->use_empty() ? AtomicExpansionKind::CmpXChg |
22796 | : AtomicExpansionKind::None; |
22797 | case AtomicRMWInst::Nand: |
22798 | case AtomicRMWInst::Max: |
22799 | case AtomicRMWInst::Min: |
22800 | case AtomicRMWInst::UMax: |
22801 | case AtomicRMWInst::UMin: |
22802 | // These always require a non-trivial set of data operations on x86. We must |
22803 | // use a cmpxchg loop. |
22804 | return AtomicExpansionKind::CmpXChg; |
22805 | } |
22806 | } |
22807 | |
22808 | LoadInst * |
22809 | X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { |
22810 | unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; |
22811 | Type *MemType = AI->getType(); |
22812 | // Accesses larger than the native width are turned into cmpxchg/libcalls, so |
22813 | // there is no benefit in turning such RMWs into loads, and it is actually |
22814 | // harmful as it introduces an mfence.
22815 | if (MemType->getPrimitiveSizeInBits() > NativeWidth) |
22816 | return nullptr; |
22817 | |
22818 | auto Builder = IRBuilder<>(AI); |
22819 | Module *M = Builder.GetInsertBlock()->getParent()->getParent(); |
22820 | auto SynchScope = AI->getSynchScope(); |
22821 | // We must restrict the ordering to avoid generating loads with Release or |
22822 | // ReleaseAcquire orderings. |
22823 | auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); |
22824 | auto Ptr = AI->getPointerOperand(); |
22825 | |
22826 | // Before the load we need a fence. Here is an example lifted from |
22827 | // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence |
22828 | // is required: |
22829 | // Thread 0: |
22830 | // x.store(1, relaxed); |
22831 | // r1 = y.fetch_add(0, release); |
22832 | // Thread 1: |
22833 | // y.fetch_add(42, acquire); |
22834 | // r2 = x.load(relaxed); |
22835 | // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is |
22836 | // lowered to just a load without a fence. An mfence flushes the store buffer,
22837 | // making the optimization clearly correct.
22838 | // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
22839 | // clear that it is otherwise; we might be able to be more aggressive on
22840 | // relaxed idempotent rmw. In practice, they do not look useful, so we don't
22841 | // try to be especially clever.
22842 | if (SynchScope == SingleThread) |
22843 | // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at |
22844 | // the IR level, so we must wrap it in an intrinsic. |
22845 | return nullptr; |
22846 | |
22847 | if (!Subtarget.hasMFence()) |
22848 | // FIXME: it might make sense to use a locked operation here but on a |
22849 | // different cache-line to prevent cache-line bouncing. In practice it |
22850 | // is probably a small win, and x86 processors without mfence are rare |
22851 | // enough that we do not bother. |
22852 | return nullptr; |
22853 | |
22854 | Function *MFence = |
22855 | llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); |
22856 | Builder.CreateCall(MFence, {}); |
22857 | |
22858 | // Finally we can emit the atomic load. |
22859 | LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, |
22860 | AI->getType()->getPrimitiveSizeInBits()); |
22861 | Loaded->setAtomic(Order, SynchScope); |
22862 | AI->replaceAllUsesWith(Loaded); |
22863 | AI->eraseFromParent(); |
22864 | return Loaded; |
22865 | } |
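      | // As an example of the transformation above (a sketch, not a guaranteed
      | // lowering), an idempotent RMW such as
      | //   int R = Y.fetch_add(0, std::memory_order_seq_cst);
      | // can become "mfence" followed by a plain load of Y instead of "lock xadd".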
22866 | |
22867 | static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, |
22868 | SelectionDAG &DAG) { |
22869 | SDLoc dl(Op); |
22870 | AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( |
22871 | cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); |
22872 | SynchronizationScope FenceScope = static_cast<SynchronizationScope>( |
22873 | cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); |
22874 | |
22875 | // The only fence that needs an instruction is a sequentially-consistent |
22876 | // cross-thread fence. |
22877 | if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && |
22878 | FenceScope == CrossThread) { |
22879 | if (Subtarget.hasMFence()) |
22880 | return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); |
22881 | |
22882 | SDValue Chain = Op.getOperand(0); |
22883 | SDValue Zero = DAG.getConstant(0, dl, MVT::i32); |
22884 | SDValue Ops[] = { |
22885 | DAG.getRegister(X86::ESP, MVT::i32), // Base |
22886 | DAG.getTargetConstant(1, dl, MVT::i8), // Scale |
22887 | DAG.getRegister(0, MVT::i32), // Index |
22888 | DAG.getTargetConstant(0, dl, MVT::i32), // Disp |
22889 | DAG.getRegister(0, MVT::i32), // Segment. |
22890 | Zero, |
22891 | Chain |
22892 | }; |
22893 | SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); |
22894 | return SDValue(Res, 0); |
22895 | } |
22896 | |
22897 | // MEMBARRIER is a compiler barrier; it codegens to a no-op. |
22898 | return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); |
22899 | } |
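      | // For instance, std::atomic_thread_fence(std::memory_order_seq_cst)
      | // compiles to "mfence" (or to the locked OR of zero to a stack slot built
      | // above when SSE2 is unavailable), while acquire and release fences emit
      | // no instruction at all on x86 (illustrative).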
22900 | |
22901 | static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, |
22902 | SelectionDAG &DAG) { |
22903 | MVT T = Op.getSimpleValueType(); |
22904 | SDLoc DL(Op); |
22905 | unsigned Reg = 0; |
22906 | unsigned size = 0; |
22907 | switch(T.SimpleTy) { |
22908 | default: llvm_unreachable("Invalid value type!");
22909 | case MVT::i8: Reg = X86::AL; size = 1; break; |
22910 | case MVT::i16: Reg = X86::AX; size = 2; break; |
22911 | case MVT::i32: Reg = X86::EAX; size = 4; break; |
22912 | case MVT::i64: |
22913 | assert(Subtarget.is64Bit() && "Node not type legal!");
22914 | Reg = X86::RAX; size = 8; |
22915 | break; |
22916 | } |
22917 | SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, |
22918 | Op.getOperand(2), SDValue()); |
22919 | SDValue Ops[] = { cpIn.getValue(0), |
22920 | Op.getOperand(1), |
22921 | Op.getOperand(3), |
22922 | DAG.getTargetConstant(size, DL, MVT::i8), |
22923 | cpIn.getValue(1) }; |
22924 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); |
22925 | MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); |
22926 | SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, |
22927 | Ops, T, MMO); |
22928 | |
22929 | SDValue cpOut = |
22930 | DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); |
22931 | SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, |
22932 | MVT::i32, cpOut.getValue(2)); |
22933 | SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); |
22934 | |
22935 | DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); |
22936 | DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); |
22937 | DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); |
22938 | return SDValue(); |
22939 | } |
22940 | |
22941 | static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, |
22942 | SelectionDAG &DAG) { |
22943 | MVT SrcVT = Op.getOperand(0).getSimpleValueType(); |
22944 | MVT DstVT = Op.getSimpleValueType(); |
22945 | |
22946 | if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || |
22947 | SrcVT == MVT::i64) { |
22948 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22949 | if (DstVT != MVT::f64) |
22950 | // This conversion needs to be expanded. |
22951 | return SDValue(); |
22952 | |
22953 | SDValue Op0 = Op->getOperand(0); |
22954 | SmallVector<SDValue, 16> Elts; |
22955 | SDLoc dl(Op); |
22956 | unsigned NumElts; |
22957 | MVT SVT; |
22958 | if (SrcVT.isVector()) { |
22959 | NumElts = SrcVT.getVectorNumElements(); |
22960 | SVT = SrcVT.getVectorElementType(); |
22961 | |
22962 | // Widen the input vector in the case of MVT::v2i32.
22963 | // Example: from MVT::v2i32 to MVT::v4i32. |
22964 | for (unsigned i = 0, e = NumElts; i != e; ++i) |
22965 | Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, |
22966 | DAG.getIntPtrConstant(i, dl))); |
22967 | } else { |
22968 | assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22969 |        "Unexpected source type in LowerBITCAST");
22970 | Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, |
22971 | DAG.getIntPtrConstant(0, dl))); |
22972 | Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, |
22973 | DAG.getIntPtrConstant(1, dl))); |
22974 | NumElts = 2; |
22975 | SVT = MVT::i32; |
22976 | } |
22977 | // Explicitly mark the extra elements as Undef. |
22978 | Elts.append(NumElts, DAG.getUNDEF(SVT)); |
22979 | |
22980 | EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); |
22981 | SDValue BV = DAG.getBuildVector(NewVT, dl, Elts); |
22982 | SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); |
22983 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, |
22984 | DAG.getIntPtrConstant(0, dl)); |
22985 | } |
22986 | |
22987 | assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22988 |        Subtarget.hasMMX() && "Unexpected custom BITCAST");
22989 | assert((DstVT == MVT::i64 ||
22990 |         (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22991 |        "Unexpected custom BITCAST");
22992 | // i64 <=> MMX conversions are Legal. |
22993 | if (SrcVT==MVT::i64 && DstVT.isVector()) |
22994 | return Op; |
22995 | if (DstVT==MVT::i64 && SrcVT.isVector()) |
22996 | return Op; |
22997 | // MMX <=> MMX conversions are Legal. |
22998 | if (SrcVT.isVector() && DstVT.isVector()) |
22999 | return Op; |
23000 | // All other conversions need to be expanded. |
23001 | return SDValue(); |
23002 | } |
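      | // E.g. on 32-bit x86 with SSE2, an i64 -> f64 bitcast is lowered above by
      | // building <lo, hi, undef, undef> : v4i32, bitcasting it to v2f64 and
      | // extracting element 0 (a sketch of the vector path).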
23003 | |
23004 | /// Compute the horizontal sum of bytes in V for the elements of VT. |
23005 | /// |
23006 | /// Requires V to be a byte vector and VT to be an integer vector type with |
23007 | /// wider elements than V's type. The width of the elements of VT determines |
23008 | /// how many bytes of V are summed horizontally to produce each element of the |
23009 | /// result. |
23010 | static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, |
23011 | const X86Subtarget &Subtarget, |
23012 | SelectionDAG &DAG) { |
23013 | SDLoc DL(V); |
23014 | MVT ByteVecVT = V.getSimpleValueType(); |
23015 | MVT EltVT = VT.getVectorElementType(); |
23016 | assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23017 |        "Expected value to have byte element type.");
23018 | assert(EltVT != MVT::i8 &&
23019 |        "Horizontal byte sum only makes sense for wider elements!");
23020 | unsigned VecSize = VT.getSizeInBits(); |
23021 | assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23022 | |
23023 | // The PSADBW instruction horizontally adds all bytes and leaves the result in
23024 | // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
23025 | if (EltVT == MVT::i64) { |
23026 | SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); |
23027 | MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); |
23028 | V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); |
23029 | return DAG.getBitcast(VT, V); |
23030 | } |
23031 | |
23032 | if (EltVT == MVT::i32) { |
23033 | // We unpack the low half and high half into i32s interleaved with zeros so |
23034 | // that we can use PSADBW to horizontally sum them. The most useful part of |
23035 | // this is that it lines up the results of two PSADBW instructions to be |
23036 | // two v2i64 vectors which concatenated are the 4 population counts. We can |
23037 | // then use PACKUSWB to shrink and concatenate them into a v4i32 again. |
23038 | SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); |
23039 | SDValue V32 = DAG.getBitcast(VT, V); |
23040 | SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros); |
23041 | SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros); |
23042 | |
23043 | // Do the horizontal sums into two v2i64s. |
23044 | Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); |
23045 | MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); |
23046 | Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, |
23047 | DAG.getBitcast(ByteVecVT, Low), Zeros); |
23048 | High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, |
23049 | DAG.getBitcast(ByteVecVT, High), Zeros); |
23050 | |
23051 | // Merge them together. |
23052 | MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); |
23053 | V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, |
23054 | DAG.getBitcast(ShortVecVT, Low), |
23055 | DAG.getBitcast(ShortVecVT, High)); |
23056 | |
23057 | return DAG.getBitcast(VT, V); |
23058 | } |
23059 | |
23060 | // The only element type left is i16. |
23061 | assert(EltVT == MVT::i16 && "Unknown how to handle type");
23062 | |
23063 | // To obtain the pop count for each i16 element starting from the pop count
23064 | // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift the
23065 | // i16s right by 8. It is important to shift as i16s because an i8 vector
23066 | // shift isn't directly supported.
23067 | SDValue ShifterV = DAG.getConstant(8, DL, VT); |
23068 | SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); |
23069 | V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), |
23070 | DAG.getBitcast(ByteVecVT, V)); |
23071 | return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); |
23072 | } |
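      | // A minimal scalar model of the i16 path above (an illustration only, not
      | // used by the lowering): each 16-bit lane holds two byte counts <= 8, so a
      | // shift-add-shift yields their sum without inter-lane carries.
      | static inline unsigned short horizontalByteSumI16Model(unsigned short Cnt) {
      |   unsigned short Shl = (unsigned short)(Cnt << 8); // low-byte count moved up
      |   unsigned short Sum = (unsigned short)(Shl + Cnt); // byte-wise add, no carry
      |   return (unsigned short)(Sum >> 8);                // both counts summed
      | }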
23073 | |
23074 | static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, |
23075 | const X86Subtarget &Subtarget, |
23076 | SelectionDAG &DAG) { |
23077 | MVT VT = Op.getSimpleValueType(); |
23078 | MVT EltVT = VT.getVectorElementType(); |
23079 | unsigned VecSize = VT.getSizeInBits(); |
23080 | |
23081 | // Implement a lookup table in register by using an algorithm based on: |
23082 | // http://wm.ite.pl/articles/sse-popcount.html |
23083 | // |
23084 | // The general idea is that every nibble of each byte in the input vector is
23085 | // an index into an in-register pre-computed pop count table. We then split up
23086 | // the input vector into two new ones: (1) a vector with only the shifted-right
23087 | // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23088 | // masked-out higher ones) for each byte. PSHUFB is used separately with both
23089 | // to index the in-register table. Next, both are added, and the result is an
23090 | // i8 vector where each element contains the pop count for its input byte.
23091 | // |
23092 | // To obtain the pop count for elements != i8, we follow up with the same |
23093 | // approach and use additional tricks as described below. |
23094 | // |
23095 | const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, |
23096 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, |
23097 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, |
23098 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; |
23099 | |
23100 | int NumByteElts = VecSize / 8; |
23101 | MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); |
23102 | SDValue In = DAG.getBitcast(ByteVecVT, Op); |
23103 | SmallVector<SDValue, 64> LUTVec; |
23104 | for (int i = 0; i < NumByteElts; ++i) |
23105 | LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); |
23106 | SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec); |
23107 | SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT); |
23108 | |
23109 | // High nibbles |
23110 | SDValue FourV = DAG.getConstant(4, DL, ByteVecVT); |
23111 | SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); |
23112 | |
23113 | // Low nibbles |
23114 | SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); |
23115 | |
23116 | // The nibble vectors are used as shuffle masks that index elements in the
23117 | // LUT. After counting low and high nibbles, add the two results to obtain
23118 | // the final pop count per i8 element.
23119 | SDValue HighPopCnt = |
23120 | DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); |
23121 | SDValue LowPopCnt = |
23122 | DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); |
23123 | SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); |
23124 | |
23125 | if (EltVT == MVT::i8) |
23126 | return PopCnt; |
23127 | |
23128 | return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); |
23129 | } |
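      | // A scalar sketch of the nibble-LUT scheme implemented above (purely
      | // illustrative; the real code handles 16/32/64 bytes at once via PSHUFB):
      | static inline unsigned popCount8LUTModel(unsigned char V) {
      |   static const unsigned char LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
      |                                         1, 2, 2, 3, 2, 3, 3, 4};
      |   return LUT[V >> 4] + LUT[V & 0x0F]; // high-nibble + low-nibble counts
      | }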
23130 | |
23131 | static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, |
23132 | const X86Subtarget &Subtarget, |
23133 | SelectionDAG &DAG) { |
23134 | MVT VT = Op.getSimpleValueType(); |
23135 | assert(VT.is128BitVector() &&
23136 |        "Only 128-bit vector bitmath lowering supported.");
23137 | |
23138 | int VecSize = VT.getSizeInBits(); |
23139 | MVT EltVT = VT.getVectorElementType(); |
23140 | int Len = EltVT.getSizeInBits(); |
23141 | |
23142 | // This is the vectorized version of the "best" algorithm from |
23143 | // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel |
23144 | // with a minor tweak to use a series of adds + shifts instead of vector |
23145 | // multiplications. Implemented for all integer vector types. We only use
23146 | // this when we don't have SSSE3, which would allow a LUT-based lowering that
23147 | // is much faster, even faster than using native popcnt instructions.
23148 | |
23149 | auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { |
23150 | MVT VT = V.getSimpleValueType(); |
23151 | SDValue ShifterV = DAG.getConstant(Shifter, DL, VT); |
23152 | return DAG.getNode(OpCode, DL, VT, V, ShifterV); |
23153 | }; |
23154 | auto GetMask = [&](SDValue V, APInt Mask) { |
23155 | MVT VT = V.getSimpleValueType(); |
23156 | SDValue MaskV = DAG.getConstant(Mask, DL, VT); |
23157 | return DAG.getNode(ISD::AND, DL, VT, V, MaskV); |
23158 | }; |
23159 | |
23160 | // We don't want to incur the implicit masks required to SRL vNi8 vectors on |
23161 | // x86, so set the SRL type to have elements at least i16 wide. This is |
23162 | // correct because all of our SRLs are followed immediately by a mask that
23163 | // handles any bits that sneak into the high bits of the byte elements.
23164 | MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); |
23165 | |
23166 | SDValue V = Op; |
23167 | |
23168 | // v = v - ((v >> 1) & 0x55555555...) |
23169 | SDValue Srl = |
23170 | DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); |
23171 | SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); |
23172 | V = DAG.getNode(ISD::SUB, DL, VT, V, And); |
23173 | |
23174 | // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) |
23175 | SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); |
23176 | Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); |
23177 | SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); |
23178 | V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); |
23179 | |
23180 | // v = (v + (v >> 4)) & 0x0F0F0F0F... |
23181 | Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); |
23182 | SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); |
23183 | V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); |
23184 | |
23185 | // At this point, V contains the byte-wise population count, and we are |
23186 | // merely doing a horizontal sum if necessary to get the wider element |
23187 | // counts. |
23188 | if (EltVT == MVT::i8) |
23189 | return V; |
23190 | |
23191 | return LowerHorizontalByteSum( |
23192 | DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, |
23193 | DAG); |
23194 | } |
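      | // The scalar counterpart of the bitmath sequence above (a sketch, assuming
      | // 32-bit unsigned), with the final horizontal sum also done by adds+shifts:
      | static inline unsigned popCount32BitmathModel(unsigned V) {
      |   V = V - ((V >> 1) & 0x55555555);                // 2-bit sums
      |   V = (V & 0x33333333) + ((V >> 2) & 0x33333333); // 4-bit sums
      |   V = (V + (V >> 4)) & 0x0F0F0F0F;                // byte sums
      |   V = V + (V >> 8);                               // fold byte sums together
      |   V = V + (V >> 16);
      |   return V & 0x3F;                                // at most 32 bits set
      | }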
23195 | |
23196 | // Please ensure that any codegen change from LowerVectorCTPOP is reflected in |
23197 | // updated cost models in X86TTIImpl::getIntrinsicInstrCost. |
23198 | static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, |
23199 | SelectionDAG &DAG) { |
23200 | MVT VT = Op.getSimpleValueType(); |
23201 | assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23202 |        "Unknown CTPOP type to handle");
23203 | SDLoc DL(Op.getNode()); |
23204 | SDValue Op0 = Op.getOperand(0); |
23205 | |
23206 | if (!Subtarget.hasSSSE3()) { |
23207 | // We can't use the fast LUT approach, so fall back on vectorized bitmath. |
23208 | assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23209 | return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); |
23210 | } |
23211 | |
23212 | // Decompose 256-bit ops into smaller 128-bit ops. |
23213 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
23214 | return Lower256IntUnary(Op, DAG); |
23215 | |
23216 | // Decompose 512-bit ops into smaller 256-bit ops. |
23217 | if (VT.is512BitVector() && !Subtarget.hasBWI()) |
23218 | return Lower512IntUnary(Op, DAG); |
23219 | |
23220 | return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); |
23221 | } |
23222 | |
23223 | static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, |
23224 | SelectionDAG &DAG) { |
23225 | assert(Op.getSimpleValueType().isVector() &&
23226 |        "We only do custom lowering for vector population count.");
23227 | return LowerVectorCTPOP(Op, Subtarget, DAG); |
23228 | } |
23229 | |
23230 | static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { |
23231 | MVT VT = Op.getSimpleValueType(); |
23232 | SDValue In = Op.getOperand(0); |
23233 | SDLoc DL(Op); |
23234 | |
23235 | // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23236 | // perform the BITREVERSE. |
23237 | if (!VT.isVector()) { |
23238 | MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); |
23239 | SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); |
23240 | Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); |
23241 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, |
23242 | DAG.getIntPtrConstant(0, DL)); |
23243 | } |
23244 | |
23245 | int NumElts = VT.getVectorNumElements(); |
23246 | int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; |
23247 | |
23248 | // Decompose 256-bit ops into smaller 128-bit ops. |
23249 | if (VT.is256BitVector()) |
23250 | return Lower256IntUnary(Op, DAG); |
23251 | |
23252 | assert(VT.is128BitVector() &&
23253 |        "Only 128-bit vector bitreverse lowering supported.");
23254 | |
23255 | // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we |
23256 | // perform the BSWAP in the shuffle. |
23257 | // It's best to shuffle using the second operand, as this will implicitly allow
23258 | // memory folding for multiple vectors. |
23259 | SmallVector<SDValue, 16> MaskElts; |
23260 | for (int i = 0; i != NumElts; ++i) { |
23261 | for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { |
23262 | int SourceByte = 16 + (i * ScalarSizeInBytes) + j; |
23263 | int PermuteByte = SourceByte | (2 << 5); |
23264 | MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); |
23265 | } |
23266 | } |
23267 | |
23268 | SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); |
23269 | SDValue Res = DAG.getBitcast(MVT::v16i8, In); |
23270 | Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), |
23271 | Res, Mask); |
23272 | return DAG.getBitcast(VT, Res); |
23273 | } |
23274 | |
23275 | static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, |
23276 | SelectionDAG &DAG) { |
23277 | if (Subtarget.hasXOP()) |
23278 | return LowerBITREVERSE_XOP(Op, DAG); |
23279 | |
23280 | assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23281 | |
23282 | MVT VT = Op.getSimpleValueType(); |
23283 | SDValue In = Op.getOperand(0); |
23284 | SDLoc DL(Op); |
23285 | |
23286 | unsigned NumElts = VT.getVectorNumElements(); |
23287 | assert(VT.getScalarType() == MVT::i8 &&
23288 |        "Only byte vector BITREVERSE supported");
23289 | |
23290 | // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. |
23291 | if (VT.is256BitVector() && !Subtarget.hasInt256()) |
23292 | return Lower256IntUnary(Op, DAG); |
23293 | |
23294 | // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its
23295 | // two nibbles, and a PSHUFB lookup finds the bitreverse of each
23296 | // 0-15 value (moved to the other nibble).
23297 | SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); |
23298 | SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); |
23299 | SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); |
23300 | |
23301 | const int LoLUT[16] = { |
23302 | /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, |
23303 | /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, |
23304 | /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, |
23305 | /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; |
23306 | const int HiLUT[16] = { |
23307 | /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, |
23308 | /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, |
23309 | /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, |
23310 | /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; |
23311 | |
23312 | SmallVector<SDValue, 16> LoMaskElts, HiMaskElts; |
23313 | for (unsigned i = 0; i < NumElts; ++i) { |
23314 | LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); |
23315 | HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); |
23316 | } |
23317 | |
23318 | SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); |
23319 | SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); |
23320 | Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); |
23321 | Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); |
23322 | return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); |
23323 | } |
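      | // A scalar model of the two-LUT byte reverse above (illustrative only):
      | static inline unsigned char bitReverse8LUTModel(unsigned char V) {
      |   static const unsigned char LoLUT[16] = {0x00, 0x80, 0x40, 0xC0,
      |                                           0x20, 0xA0, 0x60, 0xE0,
      |                                           0x10, 0x90, 0x50, 0xD0,
      |                                           0x30, 0xB0, 0x70, 0xF0};
      |   static const unsigned char HiLUT[16] = {0x00, 0x08, 0x04, 0x0C,
      |                                           0x02, 0x0A, 0x06, 0x0E,
      |                                           0x01, 0x09, 0x05, 0x0D,
      |                                           0x03, 0x0B, 0x07, 0x0F};
      |   // The reversed low nibble lands in the high nibble and vice versa.
      |   return LoLUT[V & 0xF] | HiLUT[V >> 4];
      | }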
23324 | |
23325 | static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) { |
23326 | unsigned NewOpc = 0; |
23327 | switch (N->getOpcode()) { |
23328 | case ISD::ATOMIC_LOAD_ADD: |
23329 | NewOpc = X86ISD::LADD; |
23330 | break; |
23331 | case ISD::ATOMIC_LOAD_SUB: |
23332 | NewOpc = X86ISD::LSUB; |
23333 | break; |
23334 | case ISD::ATOMIC_LOAD_OR: |
23335 | NewOpc = X86ISD::LOR; |
23336 | break; |
23337 | case ISD::ATOMIC_LOAD_XOR: |
23338 | NewOpc = X86ISD::LXOR; |
23339 | break; |
23340 | case ISD::ATOMIC_LOAD_AND: |
23341 | NewOpc = X86ISD::LAND; |
23342 | break; |
23343 | default: |
23344 | llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23345 | } |
23346 | |
23347 | MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); |
23348 | return DAG.getMemIntrinsicNode( |
23349 | NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), |
23350 | {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, |
23351 | /*MemVT=*/N->getSimpleValueType(0), MMO); |
23352 | } |
23353 | |
23354 | /// Lower atomic_load_ops into LOCK-prefixed operations. |
23355 | static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, |
23356 | const X86Subtarget &Subtarget) { |
23357 | SDValue Chain = N->getOperand(0); |
23358 | SDValue LHS = N->getOperand(1); |
23359 | SDValue RHS = N->getOperand(2); |
23360 | unsigned Opc = N->getOpcode(); |
23361 | MVT VT = N->getSimpleValueType(0); |
23362 | SDLoc DL(N); |
23363 | |
23364 | // We can lower atomic_load_add into LXADD. However, any other atomicrmw op |
23365 | // can only be lowered when the result is unused. They should have already |
23366 | // been transformed into a cmpxchg loop in AtomicExpand. |
23367 | if (N->hasAnyUseOfValue(0)) { |
23368 | // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to |
23369 | // select LXADD if LOCK_SUB can't be selected. |
23370 | if (Opc == ISD::ATOMIC_LOAD_SUB) { |
23371 | AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); |
23372 | RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); |
23373 | return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, |
23374 | RHS, AN->getMemOperand()); |
23375 | } |
23376 | assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23377 |        "Used AtomicRMW ops other than Add should have been expanded!");
23378 | return N; |
23379 | } |
23380 | |
23381 | SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG); |
23382 | // RAUW the chain, but don't worry about the result, as it's unused. |
23383 | assert(!N->hasAnyUseOfValue(0));
23384 | DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); |
23385 | return SDValue(); |
23386 | } |
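      | // E.g. a used "atomicrmw sub %p, %v" is rewritten above as
      | // "atomicrmw add %p, (0 - %v)", which can then select to a negate followed
      | // by "lock xadd" (a sketch; the unused-result case takes the LOCK path).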
23387 | |
23388 | static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { |
23389 | SDNode *Node = Op.getNode(); |
23390 | SDLoc dl(Node); |
23391 | EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); |
23392 | |
23393 | // Convert seq_cst store -> xchg |
23394 | // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) |
23395 | // FIXME: On 32-bit, store -> fist or movq would be more efficient |
23396 | // (The only way to get a 16-byte store is cmpxchg16b) |
23397 | // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. |
23398 | if (cast<AtomicSDNode>(Node)->getOrdering() == |
23399 | AtomicOrdering::SequentiallyConsistent || |
23400 | !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { |
23401 | SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, |
23402 | cast<AtomicSDNode>(Node)->getMemoryVT(), |
23403 | Node->getOperand(0), |
23404 | Node->getOperand(1), Node->getOperand(2), |
23405 | cast<AtomicSDNode>(Node)->getMemOperand()); |
23406 | return Swap.getValue(1); |
23407 | } |
23408 | // Other atomic stores have a simple pattern. |
23409 | return Op; |
23410 | } |
23411 | |
23412 | static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { |
23413 | SDNode *N = Op.getNode(); |
23414 | MVT VT = N->getSimpleValueType(0); |
23415 | |
23416 | // Let legalize expand this if it isn't a legal type yet. |
23417 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
23418 | return SDValue(); |
23419 | |
23420 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); |
23421 | SDLoc DL(N); |
23422 | |
23423 | // Set the carry flag. |
23424 | SDValue Carry = Op.getOperand(2); |
23425 | EVT CarryVT = Carry.getValueType(); |
23426 | APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); |
23427 | Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), |
23428 | Carry, DAG.getConstant(NegOne, DL, CarryVT)); |
23429 | |
23430 | unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; |
23431 | SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), |
23432 | Op.getOperand(1), Carry.getValue(1)); |
23433 | |
23434 | SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG); |
23435 | if (N->getValueType(1) == MVT::i1) |
23436 | SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); |
23437 | |
23438 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); |
23439 | } |
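      | // A scalar model of the ADDCARRY lowering above (a sketch assuming 32-bit
      | // unsigned): adding the incoming carry to all-ones sets CF iff the carry
      | // is nonzero, and a single ADC then produces both results.
      | static inline unsigned addCarry32Model(unsigned A, unsigned B,
      |                                        unsigned CarryIn, unsigned &CarryOut) {
      |   unsigned long long Wide = (unsigned long long)A + B + (CarryIn ? 1 : 0);
      |   CarryOut = (unsigned)(Wide >> 32); // the bit X86ISD::ADC leaves in CF
      |   return (unsigned)Wide;             // the 32-bit sum
      | }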
23440 | |
23441 | static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, |
23442 | SelectionDAG &DAG) { |
23443 | assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23444 | |
23445 | // For MacOSX, we want to call an alternative entry point: __sincos_stret, |
23446 | // which returns the values as { float, float } (in XMM0) or |
23447 | // { double, double } (which is returned in XMM0, XMM1). |
23448 | SDLoc dl(Op); |
23449 | SDValue Arg = Op.getOperand(0); |
23450 | EVT ArgVT = Arg.getValueType(); |
23451 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); |
23452 | |
23453 | TargetLowering::ArgListTy Args; |
23454 | TargetLowering::ArgListEntry Entry; |
23455 | |
23456 | Entry.Node = Arg; |
23457 | Entry.Ty = ArgTy; |
23458 | Entry.IsSExt = false; |
23459 | Entry.IsZExt = false; |
23460 | Args.push_back(Entry); |
23461 | |
23462 | bool isF64 = ArgVT == MVT::f64; |
23463 | // Only optimize x86_64 for now. i386 is a bit messy. For f32, |
23464 | // the small struct {f32, f32} is returned in (eax, edx). For f64, |
23465 | // the results are returned via SRet in memory. |
23466 | const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; |
23467 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
23468 | SDValue Callee = |
23469 | DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); |
23470 | |
23471 | Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) |
23472 | : (Type *)VectorType::get(ArgTy, 4); |
23473 | |
23474 | TargetLowering::CallLoweringInfo CLI(DAG); |
23475 | CLI.setDebugLoc(dl) |
23476 | .setChain(DAG.getEntryNode()) |
23477 | .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); |
23478 | |
23479 | std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); |
23480 | |
23481 | if (isF64) |
23482 | // Returned in xmm0 and xmm1. |
23483 | return CallResult.first; |
23484 | |
23485 | // Returned in bits 0:31 and 32:63 of xmm0.
23486 | SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, |
23487 | CallResult.first, DAG.getIntPtrConstant(0, dl)); |
23488 | SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, |
23489 | CallResult.first, DAG.getIntPtrConstant(1, dl)); |
23490 | SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); |
23491 | return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); |
23492 | } |
23493 | |
23494 | /// Widen a vector input to a vector of NVT. The |
23495 | /// input vector must have the same element type as NVT. |
23496 | static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, |
23497 | bool FillWithZeroes = false) { |
23498 | // Check if InOp already has the right width. |
23499 | MVT InVT = InOp.getSimpleValueType(); |
23500 | if (InVT == NVT) |
23501 | return InOp; |
23502 | |
23503 | if (InOp.isUndef()) |
23504 | return DAG.getUNDEF(NVT); |
23505 | |
23506 | assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23507 |        "input and widen element type must match");
23508 | |
23509 | unsigned InNumElts = InVT.getVectorNumElements(); |
23510 | unsigned WidenNumElts = NVT.getVectorNumElements(); |
23511 | assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23512 |        "Unexpected request for vector widening");
23513 | |
23514 | SDLoc dl(InOp); |
23515 | if (InOp.getOpcode() == ISD::CONCAT_VECTORS && |
23516 | InOp.getNumOperands() == 2) { |
23517 | SDValue N1 = InOp.getOperand(1); |
23518 | if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || |
23519 | N1.isUndef()) { |
23520 | InOp = InOp.getOperand(0); |
23521 | InVT = InOp.getSimpleValueType(); |
23522 | InNumElts = InVT.getVectorNumElements(); |
23523 | } |
23524 | } |
23525 | if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || |
23526 | ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { |
23527 | SmallVector<SDValue, 16> Ops; |
23528 | for (unsigned i = 0; i < InNumElts; ++i) |
23529 | Ops.push_back(InOp.getOperand(i)); |
23530 | |
23531 | EVT EltVT = InOp.getOperand(0).getValueType(); |
23532 | |
23533 | SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : |
23534 | DAG.getUNDEF(EltVT); |
23535 | for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) |
23536 | Ops.push_back(FillVal); |
23537 | return DAG.getBuildVector(NVT, dl, Ops); |
23538 | } |
23539 | SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : |
23540 | DAG.getUNDEF(NVT); |
23541 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, |
23542 | InOp, DAG.getIntPtrConstant(0, dl)); |
23543 | } |
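      | // E.g. widening a <2 x i32> build_vector <a, b> to <4 x i32> produces
      | // <a, b, 0, 0> when FillWithZeroes is set and <a, b, undef, undef>
      | // otherwise (illustrative).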
23544 | |
23545 | static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, |
23546 | SelectionDAG &DAG) { |
23547 | assert(Subtarget.hasAVX512() &&
23548 |        "MGATHER/MSCATTER are supported on AVX-512 arch only");
23549 | |
23550 | // X86 scatter kills the mask register, so its type should be added to
23551 | // the list of return values. |
23552 | // If the "scatter" has 2 return values, it is already handled. |
23553 | if (Op.getNode()->getNumValues() == 2) |
23554 | return Op; |
23555 | |
23556 | MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); |
23557 | SDValue Src = N->getValue(); |
23558 | MVT VT = Src.getSimpleValueType(); |
23559 | assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23560 | SDLoc dl(Op); |
23561 | |
23562 | SDValue NewScatter; |
23563 | SDValue Index = N->getIndex(); |
23564 | SDValue Mask = N->getMask(); |
23565 | SDValue Chain = N->getChain(); |
23566 | SDValue BasePtr = N->getBasePtr(); |
23567 | MVT MemVT = N->getMemoryVT().getSimpleVT(); |
23568 | MVT IndexVT = Index.getSimpleValueType(); |
23569 | MVT MaskVT = Mask.getSimpleValueType(); |
23570 | |
23571 | if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { |
23572 | // The v2i32 value was promoted to v2i64. |
23573 | // Now we "redo" the type legalizer's work and widen the original |
23574 | // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 |
23575 | // with a shuffle. |
23576 | assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23577 |        "Unexpected memory type");
23578 | int ShuffleMask[] = {0, 2, -1, -1}; |
23579 | Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), |
23580 | DAG.getUNDEF(MVT::v4i32), ShuffleMask); |
23581 | // Now we have 4 elements instead of 2. |
23582 | // Expand the index. |
23583 | MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); |
23584 | Index = ExtendToType(Index, NewIndexVT, DAG); |
23585 | |
23586 | // Expand the mask with zeroes |
23587 | // Mask may be <2 x i64> or <2 x i1> at this moment |
23588 | assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23589 |        "Unexpected mask type");
23590 | MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); |
23591 | Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); |
23592 | VT = MVT::v4i32; |
23593 | } |
23594 | |
23595 | unsigned NumElts = VT.getVectorNumElements(); |
23596 | if (!Subtarget.hasVLX() && !VT.is512BitVector() && |
23597 | !Index.getSimpleValueType().is512BitVector()) { |
23598 | // AVX512F supports only 512-bit vectors. Either the data or the index
23599 | // should be 512 bits wide. If both the index and the data are 256-bit but
23600 | // the vector contains 8 elements, we just sign-extend the index.
23601 | if (IndexVT == MVT::v8i32) |
23602 | // Just extend index |
23603 | Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); |
23604 | else { |
23605 | // The minimal number of elts in scatter is 8 |
23606 | NumElts = 8; |
23607 | // Index |
23608 | MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); |
23609 | // Use original index here, do not modify the index twice |
23610 | Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); |
23611 | if (IndexVT.getScalarType() == MVT::i32) |
23612 | Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); |
23613 | |
23614 | // Mask |
23615 | // At this point we have promoted mask operand |
23616 | assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23617 | MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); |
23618 | // Use the original mask here, do not modify the mask twice |
23619 | Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); |
23620 | |
23621 | // The value that should be stored |
23622 | MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); |
23623 | Src = ExtendToType(Src, NewVT, DAG); |
23624 | } |
23625 | } |
23626 | // If the mask is "wide" at this point - truncate it to i1 vector |
23627 | MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); |
23628 | Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); |
23629 | |
23630 | // The mask is killed by scatter, add it to the values |
23631 | SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); |
23632 | SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; |
23633 | NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, |
23634 | N->getMemOperand()); |
23635 | DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); |
23636 | return SDValue(NewScatter.getNode(), 1); |
23637 | } |
23638 | |
23639 | static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, |
23640 | SelectionDAG &DAG) { |
23641 | |
23642 | MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); |
23643 | MVT VT = Op.getSimpleValueType(); |
23644 | MVT ScalarVT = VT.getScalarType(); |
23645 | SDValue Mask = N->getMask(); |
23646 | SDLoc dl(Op); |
23647 | |
23648 | assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23649 |        "Expanding masked load is supported on AVX-512 target only!");
23650 | |
23651 | assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23652 |        "Expanding masked load is supported for 32 and 64-bit types only!");
23653 | |
23654 | // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23655 | // VLX. Expanding loads of these types are handled below.
23656 | if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4) |
23657 | return Op; |
23658 | |
23659 | assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23660 |        "Cannot lower masked load op.");
23661 | |
23662 | assert((ScalarVT.getSizeInBits() >= 32 ||
23663 |         (Subtarget.hasBWI() &&
23664 |          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23665 |        "Unsupported masked load op.");
23666 | |
23667 | // This operation is legal for targets with VLX, but without
23668 | // VLX the vector should be widened to 512 bits.
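| // For example (illustrative only), a v8i32 masked load on AVX512F without
| // VLX is widened from
| //   v8i32  = masked_load ptr, v8i1 mask, v8i32 src0
| // to
| //   v16i32 = masked_load ptr, v16i1 mask', v16i32 src0'
| // where mask' is the original mask zero-extended to 16 lanes; the original
| // v8i32 result is then recovered with an EXTRACT_SUBVECTOR at index 0.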
23669 | unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); |
23670 | MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); |
23671 | SDValue Src0 = N->getSrc0(); |
23672 | Src0 = ExtendToType(Src0, WideDataVT, DAG); |
23673 | |
23674 | // Mask element has to be i1. |
23675 | MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); |
23676 | assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23677 | "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23678 | |
23679 | MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); |
23680 | |
23681 | Mask = ExtendToType(Mask, WideMaskVT, DAG, true); |
23682 | if (MaskEltTy != MVT::i1) |
23683 | Mask = DAG.getNode(ISD::TRUNCATE, dl, |
23684 | MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); |
23685 | SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), |
23686 | N->getBasePtr(), Mask, Src0, |
23687 | N->getMemoryVT(), N->getMemOperand(), |
23688 | N->getExtensionType(), |
23689 | N->isExpandingLoad()); |
23690 | |
23691 | SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23692 | NewLoad.getValue(0),
23693 | DAG.getIntPtrConstant(0, dl));
23694 | SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23695 | return DAG.getMergeValues(RetOps, dl); |
23696 | } |
23697 | |
23698 | static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, |
23699 | SelectionDAG &DAG) { |
23700 | MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); |
23701 | SDValue DataToStore = N->getValue(); |
23702 | MVT VT = DataToStore.getSimpleValueType(); |
23703 | MVT ScalarVT = VT.getScalarType(); |
23704 | SDValue Mask = N->getMask(); |
23705 | SDLoc dl(Op); |
23706 | |
23707 | assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23708 | "Compressing masked store is supported on AVX-512 targets only!");
23709 | |
23710 | assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23711 | "Compressing masked store is supported for 32 and 64-bit types only!");
23712 | |
23713 | // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23714 | if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4) |
23715 | return Op; |
23716 | |
23717 | assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23718 | "Cannot lower masked store op.");
23719 | |
23720 | assert((ScalarVT.getSizeInBits() >= 32 ||
23721 | (Subtarget.hasBWI() &&
23722 | (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23723 | "Unsupported masked store op.");
23724 | |
23725 | // This operation is legal for targets with VLX, but without
23726 | // VLX the vector should be widened to 512 bits.
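| // For example (illustrative only), a v8i32 masked store without VLX becomes
| // a v16i32 masked store: the data is widened with undef upper lanes and the
| // mask is zero-extended to v16i1, so the extra lanes are never written.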
23727 | unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); |
23728 | MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); |
23729 | |
23730 | // Mask element has to be i1. |
23731 | MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); |
23732 | assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23733 | "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23734 | |
23735 | MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); |
23736 | |
23737 | DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); |
23738 | Mask = ExtendToType(Mask, WideMaskVT, DAG, true); |
23739 | if (MaskEltTy != MVT::i1) |
23740 | Mask = DAG.getNode(ISD::TRUNCATE, dl, |
23741 | MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); |
23742 | return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), |
23743 | Mask, N->getMemoryVT(), N->getMemOperand(), |
23744 | N->isTruncatingStore(), N->isCompressingStore()); |
23745 | } |
23746 | |
23747 | static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, |
23748 | SelectionDAG &DAG) { |
23749 | assert(Subtarget.hasAVX512() &&
23750 | "MGATHER/MSCATTER are supported on AVX-512 arch only");
23751 | |
23752 | MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); |
23753 | SDLoc dl(Op); |
23754 | MVT VT = Op.getSimpleValueType(); |
23755 | SDValue Index = N->getIndex(); |
23756 | SDValue Mask = N->getMask(); |
23757 | SDValue Src0 = N->getValue(); |
23758 | MVT IndexVT = Index.getSimpleValueType(); |
23759 | MVT MaskVT = Mask.getSimpleValueType(); |
23760 | |
23761 | unsigned NumElts = VT.getVectorNumElements(); |
23762 | assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23763 | |
23764 | if (!Subtarget.hasVLX() && !VT.is512BitVector() && |
23765 | !Index.getSimpleValueType().is512BitVector()) { |
23766 | // AVX512F supports only 512-bit vectors; either the data or the index
23767 | // should be 512 bits wide. If both the index and data are 256-bit but
23768 | // the vector contains 8 elements, we just sign-extend the index.
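| // For example (illustrative only), a v8i32 gather with a v8i32 index has
| // its index sign-extended to v8i64 (512 bits), which matches an AVX512F
| // gather pattern such as VPGATHERQD.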
23769 | if (NumElts == 8) { |
23770 | Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); |
23771 | SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), |
23772 | N->getOperand(3), Index }; |
23773 | DAG.UpdateNodeOperands(N, Ops); |
23774 | return Op; |
23775 | } |
23776 | |
23777 | // Minimum number of elements in a gather operation
23778 | NumElts = 8; |
23779 | // Index |
23780 | MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); |
23781 | Index = ExtendToType(Index, NewIndexVT, DAG); |
23782 | if (IndexVT.getScalarType() == MVT::i32) |
23783 | Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); |
23784 | |
23785 | // Mask |
23786 | MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); |
23787 | // At this point the mask operand has been promoted.
23788 | assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23789 | MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); |
23790 | Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); |
23791 | Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); |
23792 | |
23793 | // The pass-through value |
23794 | MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); |
23795 | Src0 = ExtendToType(Src0, NewVT, DAG); |
23796 | |
23797 | SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; |
23798 | SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), |
23799 | N->getMemoryVT(), dl, Ops, |
23800 | N->getMemOperand()); |
23801 | SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23802 | NewGather.getValue(0),
23803 | DAG.getIntPtrConstant(0, dl));
23804 | SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23805 | return DAG.getMergeValues(RetOps, dl); |
23806 | } |
23807 | if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { |
23808 | // There is a special case when the return type v2i32 is illegal and
23809 | // the type legalizer extended it to v2i64. Without this conversion we end
23810 | // up with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD.
23811 | // To avoid this situation, we build an X86-specific Gather node
23812 | // with index type v2i64 and value type v4i32.
23813 | assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23814 | "Unexpected type in masked gather");
23815 | Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, |
23816 | DAG.getBitcast(MVT::v4i32, Src0), |
23817 | DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); |
23818 | // The mask should match the destination type. Extending the mask with
23819 | // zeroes is not necessary since the instruction itself reads only two
23820 | // values from memory.
23821 | Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); |
23822 | SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; |
23823 | SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( |
23824 | DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), |
23825 | N->getMemOperand()); |
23826 | |
23827 | SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64, |
23828 | NewGather.getValue(0), DAG); |
23829 | SDValue RetOps[] = { Sext, NewGather.getValue(1) }; |
23830 | return DAG.getMergeValues(RetOps, dl); |
23831 | } |
23832 | if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { |
23833 | // This transformation is for optimization only.
23834 | // The type legalizer extended the mask and index to 4-element vectors
23835 | // in order to match the requirements of the common gather node - the
23836 | // same vector width for index and value. The X86 gather node allows a
23837 | // mismatch of vector widths in order to select a more optimal
23838 | // instruction at the end.
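| // For example (illustrative only), when the legalizer produced
| //   mask  = concat_vectors(half-mask, zeroes)
| //   index = concat_vectors(half-index, undef)
| // the code below recovers the original two-element index and rebuilds a
| // v4i1 mask whose upper lanes are zero, so only two values are gathered.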
23839 | assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23840 | "Unexpected type in masked gather");
23841 | if (Mask.getOpcode() == ISD::CONCAT_VECTORS && |
23842 | ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && |
23843 | Index.getOpcode() == ISD::CONCAT_VECTORS && |
23844 | Index.getOperand(1).isUndef()) { |
23845 | Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); |
23846 | Index = Index.getOperand(0); |
23847 | } else |
23848 | return Op; |
23849 | SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; |
23850 | SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( |
23851 | DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(), |
23852 | N->getMemOperand()); |
23853 | |
23854 | SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) }; |
23855 | return DAG.getMergeValues(RetOps, dl); |
23856 | |
23857 | } |
23858 | return Op; |
23859 | } |
23860 | |
23861 | SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, |
23862 | SelectionDAG &DAG) const { |
23863 | // TODO: Eventually, the lowering of these nodes should be informed by or |
23864 | // deferred to the GC strategy for the function in which they appear. For |
23865 | // now, however, they must be lowered to something. Since they are logically |
23866 | // no-ops in the case of a null GC strategy (or a GC strategy which does not |
23867 | // require special handling for these nodes), lower them as literal NOOPs for |
23868 | // the time being. |
23869 | SmallVector<SDValue, 2> Ops; |
23870 | |
23871 | Ops.push_back(Op.getOperand(0)); |
23872 | if (Op->getGluedNode()) |
23873 | Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); |
23874 | |
23875 | SDLoc OpDL(Op); |
23876 | SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); |
23877 | SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
23878 | |
23879 | return NOOP; |
23880 | } |
23881 | |
23882 | SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, |
23883 | SelectionDAG &DAG) const { |
23884 | // TODO: Eventually, the lowering of these nodes should be informed by or |
23885 | // deferred to the GC strategy for the function in which they appear. For |
23886 | // now, however, they must be lowered to something. Since they are logically |
23887 | // no-ops in the case of a null GC strategy (or a GC strategy which does not |
23888 | // require special handling for these nodes), lower them as literal NOOPs for |
23889 | // the time being. |
23890 | SmallVector<SDValue, 2> Ops; |
23891 | |
23892 | Ops.push_back(Op.getOperand(0)); |
23893 | if (Op->getGluedNode()) |
23894 | Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); |
23895 | |
23896 | SDLoc OpDL(Op); |
23897 | SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); |
23898 | SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
23899 | |
23900 | return NOOP; |
23901 | } |
23902 | |
23903 | /// Provide custom lowering hooks for some operations. |
23904 | SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
23905 | switch (Op.getOpcode()) { |
23906 | default: llvm_unreachable("Should not custom lower this!");
23907 | case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); |
23908 | case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: |
23909 | return LowerCMP_SWAP(Op, Subtarget, DAG); |
23910 | case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); |
23911 | case ISD::ATOMIC_LOAD_ADD: |
23912 | case ISD::ATOMIC_LOAD_SUB: |
23913 | case ISD::ATOMIC_LOAD_OR: |
23914 | case ISD::ATOMIC_LOAD_XOR: |
23915 | case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); |
23916 | case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); |
23917 | case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); |
23918 | case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); |
23919 | case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); |
23920 | case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); |
23921 | case ISD::VSELECT: return LowerVSELECT(Op, DAG); |
23922 | case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
23923 | case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); |
23924 | case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); |
23925 | case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); |
23926 | case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); |
23927 | case ISD::ConstantPool: return LowerConstantPool(Op, DAG); |
23928 | case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); |
23929 | case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); |
23930 | case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); |
23931 | case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); |
23932 | case ISD::SHL_PARTS: |
23933 | case ISD::SRA_PARTS: |
23934 | case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); |
23935 | case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); |
23936 | case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); |
23937 | case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); |
23938 | case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); |
23939 | case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); |
23940 | case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); |
23941 | case ISD::ZERO_EXTEND_VECTOR_INREG: |
23942 | case ISD::SIGN_EXTEND_VECTOR_INREG: |
23943 | return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); |
23944 | case ISD::FP_TO_SINT: |
23945 | case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); |
23946 | case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); |
23947 | case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); |
23948 | case ISD::FABS: |
23949 | case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); |
23950 | case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); |
23951 | case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); |
23952 | case ISD::SETCC: return LowerSETCC(Op, DAG); |
23953 | case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); |
23954 | case ISD::SELECT: return LowerSELECT(Op, DAG); |
23955 | case ISD::BRCOND: return LowerBRCOND(Op, DAG); |
23956 | case ISD::JumpTable: return LowerJumpTable(Op, DAG); |
23957 | case ISD::VASTART: return LowerVASTART(Op, DAG); |
23958 | case ISD::VAARG: return LowerVAARG(Op, DAG); |
23959 | case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); |
23960 | case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); |
23961 | case ISD::INTRINSIC_VOID: |
23962 | case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); |
23963 | case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); |
23964 | case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); |
23965 | case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); |
23966 | case ISD::FRAME_TO_ARGS_OFFSET: |
23967 | return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); |
23968 | case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); |
23969 | case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); |
23970 | case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); |
23971 | case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); |
23972 | case ISD::EH_SJLJ_SETUP_DISPATCH: |
23973 | return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); |
23974 | case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); |
23975 | case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); |
23976 | case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); |
23977 | case ISD::CTLZ: |
23978 | case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); |
23979 | case ISD::CTTZ: |
23980 | case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); |
23981 | case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); |
23982 | case ISD::MULHS: |
23983 | case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); |
23984 | case ISD::UMUL_LOHI: |
23985 | case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); |
23986 | case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); |
23987 | case ISD::SRA: |
23988 | case ISD::SRL: |
23989 | case ISD::SHL: return LowerShift(Op, Subtarget, DAG); |
23990 | case ISD::SADDO: |
23991 | case ISD::UADDO: |
23992 | case ISD::SSUBO: |
23993 | case ISD::USUBO: |
23994 | case ISD::SMULO: |
23995 | case ISD::UMULO: return LowerXALUO(Op, DAG); |
23996 | case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); |
23997 | case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); |
23998 | case ISD::ADDCARRY: |
23999 | case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); |
24000 | case ISD::ADD: |
24001 | case ISD::SUB: return LowerADD_SUB(Op, DAG); |
24002 | case ISD::SMAX: |
24003 | case ISD::SMIN: |
24004 | case ISD::UMAX: |
24005 | case ISD::UMIN: return LowerMINMAX(Op, DAG); |
24006 | case ISD::ABS: return LowerABS(Op, DAG); |
24007 | case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); |
24008 | case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); |
24009 | case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); |
24010 | case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); |
24011 | case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); |
24012 | case ISD::GC_TRANSITION_START: |
24013 | return LowerGC_TRANSITION_START(Op, DAG); |
24014 | case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); |
24015 | case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG); |
24016 | } |
24017 | } |
24018 | |
24019 | /// Places new result values for the node in Results (their number |
24020 | /// and types must exactly match those of the original return values of |
24021 | /// the node), or leaves Results empty, which indicates that the node is not |
24022 | /// to be custom lowered after all. |
24023 | void X86TargetLowering::LowerOperationWrapper(SDNode *N, |
24024 | SmallVectorImpl<SDValue> &Results, |
24025 | SelectionDAG &DAG) const { |
24026 | SDValue Res = LowerOperation(SDValue(N, 0), DAG); |
24027 | |
24028 | if (!Res.getNode()) |
24029 | return; |
24030 | |
24031 | assert((N->getNumValues() <= Res->getNumValues()) &&
24032 | "Lowering returned the wrong number of results!");
24033 | |
24034 | // Places new result values based on the N result number.
24035 | // In some cases (LowerSINT_TO_FP for example) Res has more result values
24036 | // than the original node; the chain (last value) should be dropped.
24037 | for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) |
24038 | Results.push_back(Res.getValue(I)); |
24039 | } |
24040 | |
24041 | /// Replace a node with an illegal result type with a new node built out of |
24042 | /// custom code. |
24043 | void X86TargetLowering::ReplaceNodeResults(SDNode *N, |
24044 | SmallVectorImpl<SDValue> &Results,
24045 | SelectionDAG &DAG) const { |
24046 | SDLoc dl(N); |
24047 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
24048 | switch (N->getOpcode()) { |
24049 | default: |
24050 | llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 24050); |
24051 | case X86ISD::AVG: { |
24052 | // Legalize types for X86ISD::AVG by expanding vectors. |
24053 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
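| // For example (illustrative only), a v8i8 X86ISD::AVG is widened by
| // concatenating each operand with undef up to the 128-bit register size:
| //   v16i8 avg (concat v8i8 a, undef), (concat v8i8 b, undef)
| // and the original v8i8 result is extracted at subvector index 0.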
24054 | |
24055 | auto InVT = N->getValueType(0); |
24056 | auto InVTSize = InVT.getSizeInBits(); |
24057 | const unsigned RegSize = |
24058 | (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; |
24059 | assert((Subtarget.hasBWI() || RegSize < 512) &&
24060 | "512-bit vector requires AVX512BW");
24061 | assert((Subtarget.hasAVX2() || RegSize < 256) &&
24062 | "256-bit vector requires AVX2");
24063 | |
24064 | auto ElemVT = InVT.getVectorElementType(); |
24065 | auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, |
24066 | RegSize / ElemVT.getSizeInBits()); |
24067 | assert(RegSize % InVT.getSizeInBits() == 0);
24068 | unsigned NumConcat = RegSize / InVT.getSizeInBits(); |
24069 | |
24070 | SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); |
24071 | Ops[0] = N->getOperand(0); |
24072 | SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); |
24073 | Ops[0] = N->getOperand(1); |
24074 | SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); |
24075 | |
24076 | SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); |
24077 | Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, |
24078 | DAG.getIntPtrConstant(0, dl))); |
24079 | return; |
24080 | } |
24081 | // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. |
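| // For example (illustrative only):
| //   v2f32 = fmin a, b
| // becomes
| //   v4f32 = fmin (concat_vectors a, undef), (concat_vectors b, undef)
| // where only the low two lanes of the result are meaningful.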
24082 | case X86ISD::FMINC: |
24083 | case X86ISD::FMIN: |
24084 | case X86ISD::FMAXC: |
24085 | case X86ISD::FMAX: { |
24086 | EVT VT = N->getValueType(0); |
24087 | assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24088 | SDValue UNDEF = DAG.getUNDEF(VT); |
24089 | SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, |
24090 | N->getOperand(0), UNDEF); |
24091 | SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, |
24092 | N->getOperand(1), UNDEF); |
24093 | Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); |
24094 | return; |
24095 | } |
24096 | case ISD::SDIV: |
24097 | case ISD::UDIV: |
24098 | case ISD::SREM: |
24099 | case ISD::UREM: |
24100 | case ISD::SDIVREM: |
24101 | case ISD::UDIVREM: { |
24102 | SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); |
24103 | Results.push_back(V); |
24104 | return; |
24105 | } |
24106 | case ISD::FP_TO_SINT: |
24107 | case ISD::FP_TO_UINT: { |
24108 | bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; |
24109 | |
24110 | if (N->getValueType(0) == MVT::v2i32) { |
24111 | assert((IsSigned || Subtarget.hasAVX512()) &&
24112 | "Can only handle signed conversion without AVX512");
24113 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24114 | SDValue Src = N->getOperand(0); |
24115 | if (Src.getValueType() == MVT::v2f64) { |
24116 | SDValue Idx = DAG.getIntPtrConstant(0, dl); |
24117 | SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI |
24118 | : X86ISD::CVTTP2UI, |
24119 | dl, MVT::v4i32, Src); |
24120 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); |
24121 | Results.push_back(Res); |
24122 | return; |
24123 | } |
24124 | if (Src.getValueType() == MVT::v2f32) { |
24125 | SDValue Idx = DAG.getIntPtrConstant(0, dl); |
24126 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, |
24127 | DAG.getUNDEF(MVT::v2f32)); |
24128 | Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT |
24129 | : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); |
24130 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); |
24131 | Results.push_back(Res); |
24132 | return; |
24133 | } |
24134 | |
24135 | // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, |
24136 | // so early out here. |
24137 | return; |
24138 | } |
24139 | |
24140 | std::pair<SDValue,SDValue> Vals = |
24141 | FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); |
24142 | SDValue FIST = Vals.first, StackSlot = Vals.second; |
24143 | if (FIST.getNode()) { |
24144 | EVT VT = N->getValueType(0); |
24145 | // Return a load from the stack slot. |
24146 | if (StackSlot.getNode()) |
24147 | Results.push_back( |
24148 | DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); |
24149 | else |
24150 | Results.push_back(FIST); |
24151 | } |
24152 | return; |
24153 | } |
24154 | case ISD::SINT_TO_FP: { |
24155 | assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24156 | SDValue Src = N->getOperand(0); |
24157 | if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) |
24158 | return; |
24159 | Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); |
24160 | return; |
24161 | } |
24162 | case ISD::UINT_TO_FP: { |
24163 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24164 | EVT VT = N->getValueType(0); |
24165 | if (VT != MVT::v2f32) |
24166 | return; |
24167 | SDValue Src = N->getOperand(0); |
24168 | EVT SrcVT = Src.getValueType(); |
24169 | if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { |
24170 | Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); |
24171 | return; |
24172 | } |
24173 | if (SrcVT != MVT::v2i32) |
24174 | return; |
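| // The sequence below performs an unsigned v2i32 -> v2f32 conversion without
| // native unsigned-convert support: zero-extend to i64, OR in the bit
| // pattern of 2^52 (0x4330000000000000), bitcast to double, then subtract
| // 2^52. For example (illustrative only), a lane holding 7 becomes the
| // double 2^52 + 7, and (2^52 + 7) - 2^52 is exactly 7.0. This is exact
| // because any u32 fits in the 52-bit mantissa of a double.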
24175 | SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); |
24176 | SDValue VBias = |
24177 | DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); |
24178 | SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, |
24179 | DAG.getBitcast(MVT::v2i64, VBias)); |
24180 | Or = DAG.getBitcast(MVT::v2f64, Or); |
24181 | // TODO: Are there any fast-math-flags to propagate here? |
24182 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); |
24183 | Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); |
24184 | return; |
24185 | } |
24186 | case ISD::FP_ROUND: { |
24187 | if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) |
24188 | return; |
24189 | SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); |
24190 | Results.push_back(V); |
24191 | return; |
24192 | } |
24193 | case ISD::FP_EXTEND: { |
24194 | // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. |
24195 | // No other ValueType for FP_EXTEND should reach this point. |
24196 | assert(N->getValueType(0) == MVT::v2f32 &&
24197 | "Do not know how to legalize this Node");
24198 | return; |
24199 | } |
24200 | case ISD::INTRINSIC_W_CHAIN: { |
24201 | unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); |
24202 | switch (IntNo) { |
24203 | default : llvm_unreachable("Do not know how to custom type "
24204 | "legalize this intrinsic operation!");
24205 | case Intrinsic::x86_rdtsc: |
24206 | return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, |
24207 | Results); |
24208 | case Intrinsic::x86_rdtscp: |
24209 | return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, |
24210 | Results); |
24211 | case Intrinsic::x86_rdpmc: |
24212 | return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); |
24213 | |
24214 | case Intrinsic::x86_xgetbv: |
24215 | return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); |
24216 | } |
24217 | } |
24218 | case ISD::INTRINSIC_WO_CHAIN: { |
24219 | if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) |
24220 | Results.push_back(V); |
24221 | return; |
24222 | } |
24223 | case ISD::READCYCLECOUNTER: { |
24224 | return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, |
24225 | Results); |
24226 | } |
24227 | case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { |
24228 | EVT T = N->getValueType(0); |
24229 | assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
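| // For example (illustrative only), an i128 cmpxchg expands to
| // LCMPXCHG16_DAG: the expected value is split across RAX (low half) and
| // RDX (high half), the new value across RBX and RCX, and the result is
| // reassembled from RAX:RDX with success computed from EFLAGS via SETE.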
24230 | bool Regs64bit = T == MVT::i128; |
24231 | MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; |
24232 | SDValue cpInL, cpInH; |
24233 | cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), |
24234 | DAG.getConstant(0, dl, HalfT)); |
24235 | cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), |
24236 | DAG.getConstant(1, dl, HalfT)); |
24237 | cpInL = DAG.getCopyToReg(N->getOperand(0), dl, |
24238 | Regs64bit ? X86::RAX : X86::EAX, |
24239 | cpInL, SDValue()); |
24240 | cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, |
24241 | Regs64bit ? X86::RDX : X86::EDX, |
24242 | cpInH, cpInL.getValue(1)); |
24243 | SDValue swapInL, swapInH; |
24244 | swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), |
24245 | DAG.getConstant(0, dl, HalfT)); |
24246 | swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), |
24247 | DAG.getConstant(1, dl, HalfT)); |
24248 | swapInH = |
24249 | DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, |
24250 | swapInH, cpInH.getValue(1)); |
24251 | // If the current function needs the base pointer, RBX,
24252 | // we shouldn't use cmpxchg directly.
24253 | // Indeed the lowering of that instruction will clobber
24254 | // that register, and since RBX will be a reserved register
24255 | // the register allocator will not make sure its value is
24256 | // properly saved and restored around this live-range.
24257 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
24258 | SDValue Result; |
24259 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); |
24260 | unsigned BasePtr = TRI->getBaseRegister(); |
24261 | MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); |
24262 | if (TRI->hasBasePointer(DAG.getMachineFunction()) && |
24263 | (BasePtr == X86::RBX || BasePtr == X86::EBX)) { |
24264 | // ISel prefers the LCMPXCHG64 variant.
24265 | // If the assert below breaks, that means this is not the case anymore,
24266 | // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24267 | // not just EBX. This is a matter of accepting i64 input for that
24268 | // pseudo, and restoring into the register of the right width
24269 | // in the expand pseudo. Everything else should just work.
24270 | assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24271 | "Saving only half of the RBX");
24272 | unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG |
24273 | : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; |
24274 | SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, |
24275 | Regs64bit ? X86::RBX : X86::EBX, |
24276 | HalfT, swapInH.getValue(1)); |
24277 | SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, |
24278 | RBXSave, |
24279 | /*Glue*/ RBXSave.getValue(2)}; |
24280 | Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); |
24281 | } else { |
24282 | unsigned Opcode = |
24283 | Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; |
24284 | swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, |
24285 | Regs64bit ? X86::RBX : X86::EBX, swapInL, |
24286 | swapInH.getValue(1)); |
24287 | SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), |
24288 | swapInL.getValue(1)}; |
24289 | Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); |
24290 | } |
24291 | SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, |
24292 | Regs64bit ? X86::RAX : X86::EAX, |
24293 | HalfT, Result.getValue(1)); |
24294 | SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, |
24295 | Regs64bit ? X86::RDX : X86::EDX, |
24296 | HalfT, cpOutL.getValue(2)); |
24297 | SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; |
24298 | |
24299 | SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, |
24300 | MVT::i32, cpOutH.getValue(2)); |
24301 | SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); |
24302 | Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); |
24303 | |
24304 | Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); |
24305 | Results.push_back(Success); |
24306 | Results.push_back(EFLAGS.getValue(1)); |
24307 | return; |
24308 | } |
24309 | case ISD::ATOMIC_SWAP: |
24310 | case ISD::ATOMIC_LOAD_ADD: |
24311 | case ISD::ATOMIC_LOAD_SUB: |
24312 | case ISD::ATOMIC_LOAD_AND: |
24313 | case ISD::ATOMIC_LOAD_OR: |
24314 | case ISD::ATOMIC_LOAD_XOR: |
24315 | case ISD::ATOMIC_LOAD_NAND: |
24316 | case ISD::ATOMIC_LOAD_MIN: |
24317 | case ISD::ATOMIC_LOAD_MAX: |
24318 | case ISD::ATOMIC_LOAD_UMIN: |
24319 | case ISD::ATOMIC_LOAD_UMAX: |
24320 | case ISD::ATOMIC_LOAD: { |
24321 | // Delegate to generic TypeLegalization. Situations we can really handle |
24322 | // should have already been dealt with by AtomicExpandPass.cpp. |
24323 | break; |
24324 | } |
24325 | case ISD::BITCAST: { |
24326 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24327 | EVT DstVT = N->getValueType(0); |
24328 | EVT SrcVT = N->getOperand(0)->getValueType(0); |
24329 | |
24330 | if (SrcVT != MVT::f64 || |
24331 | (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) |
24332 | return; |
24333 | |
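| // For example (illustrative only), an f64 -> v4i16 bitcast places the f64
| // into lane 0 of a v2f64, bitcasts that to v8i16, and (without widening
| // legalization) extracts the low four i16 elements into a v4i16
| // build_vector.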
24334 | unsigned NumElts = DstVT.getVectorNumElements(); |
24335 | EVT SVT = DstVT.getVectorElementType(); |
24336 | EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); |
24337 | SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, |
24338 | MVT::v2f64, N->getOperand(0)); |
24339 | SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); |
24340 | |
24341 | if (ExperimentalVectorWideningLegalization) { |
24342 | // If we are legalizing vectors by widening, we already have the desired |
24343 | // legal vector type, just return it. |
24344 | Results.push_back(ToVecInt); |
24345 | return; |
24346 | } |
24347 | |
24348 | SmallVector<SDValue, 8> Elts; |
24349 | for (unsigned i = 0, e = NumElts; i != e; ++i) |
24350 | Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, |
24351 | ToVecInt, DAG.getIntPtrConstant(i, dl))); |
24352 | |
24353 | Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); |
24354 | } |
24355 | } |
24356 | } |
24357 | |
24358 | const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { |
24359 | switch ((X86ISD::NodeType)Opcode) { |
24360 | case X86ISD::FIRST_NUMBER: break; |
24361 | case X86ISD::BSF: return "X86ISD::BSF"; |
24362 | case X86ISD::BSR: return "X86ISD::BSR"; |
24363 | case X86ISD::SHLD: return "X86ISD::SHLD"; |
24364 | case X86ISD::SHRD: return "X86ISD::SHRD"; |
24365 | case X86ISD::FAND: return "X86ISD::FAND"; |
24366 | case X86ISD::FANDN: return "X86ISD::FANDN"; |
24367 | case X86ISD::FOR: return "X86ISD::FOR"; |
24368 | case X86ISD::FXOR: return "X86ISD::FXOR"; |
24369 | case X86ISD::FILD: return "X86ISD::FILD"; |
24370 | case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; |
24371 | case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; |
24372 | case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; |
24373 | case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; |
24374 | case X86ISD::FLD: return "X86ISD::FLD"; |
24375 | case X86ISD::FST: return "X86ISD::FST"; |
24376 | case X86ISD::CALL: return "X86ISD::CALL"; |
24377 | case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; |
24378 | case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; |
24379 | case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; |
24380 | case X86ISD::BT: return "X86ISD::BT"; |
24381 | case X86ISD::CMP: return "X86ISD::CMP"; |
24382 | case X86ISD::COMI: return "X86ISD::COMI"; |
24383 | case X86ISD::UCOMI: return "X86ISD::UCOMI"; |
24384 | case X86ISD::CMPM: return "X86ISD::CMPM"; |
24385 | case X86ISD::CMPMU: return "X86ISD::CMPMU"; |
24386 | case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; |
24387 | case X86ISD::SETCC: return "X86ISD::SETCC"; |
24388 | case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; |
24389 | case X86ISD::FSETCC: return "X86ISD::FSETCC"; |
24390 | case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; |
24391 | case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; |
24392 | case X86ISD::CMOV: return "X86ISD::CMOV"; |
24393 | case X86ISD::BRCOND: return "X86ISD::BRCOND"; |
24394 | case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; |
24395 | case X86ISD::IRET: return "X86ISD::IRET"; |
24396 | case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; |
24397 | case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; |
24398 | case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; |
24399 | case X86ISD::Wrapper: return "X86ISD::Wrapper"; |
24400 | case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; |
24401 | case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; |
24402 | case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; |
24403 | case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; |
24404 | case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; |
24405 | case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; |
24406 | case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; |
24407 | case X86ISD::PINSRB: return "X86ISD::PINSRB"; |
24408 | case X86ISD::PINSRW: return "X86ISD::PINSRW"; |
24409 | case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; |
24410 | case X86ISD::ANDNP: return "X86ISD::ANDNP"; |
24411 | case X86ISD::BLENDI: return "X86ISD::BLENDI"; |
24412 | case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; |
24413 | case X86ISD::ADDUS: return "X86ISD::ADDUS"; |
24414 | case X86ISD::SUBUS: return "X86ISD::SUBUS"; |
24415 | case X86ISD::HADD: return "X86ISD::HADD"; |
24416 | case X86ISD::HSUB: return "X86ISD::HSUB"; |
24417 | case X86ISD::FHADD: return "X86ISD::FHADD"; |
24418 | case X86ISD::FHSUB: return "X86ISD::FHSUB"; |
24419 | case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; |
24420 | case X86ISD::FMAX: return "X86ISD::FMAX"; |
24421 | case X86ISD::FMAXS: return "X86ISD::FMAXS"; |
24422 | case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; |
24423 | case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24424 | case X86ISD::FMIN: return "X86ISD::FMIN"; |
24425 | case X86ISD::FMINS: return "X86ISD::FMINS"; |
24426 | case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; |
24427 | case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; |
24428 | case X86ISD::FMAXC: return "X86ISD::FMAXC"; |
24429 | case X86ISD::FMINC: return "X86ISD::FMINC"; |
24430 | case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; |
24431 | case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; |
24432 | case X86ISD::FRCP: return "X86ISD::FRCP"; |
24433 | case X86ISD::FRCPS: return "X86ISD::FRCPS"; |
24434 | case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; |
24435 | case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; |
24436 | case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; |
24437 | case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; |
24438 | case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; |
24439 | case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; |
24440 | case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; |
24441 | case X86ISD::EH_SJLJ_SETUP_DISPATCH: |
24442 | return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; |
24443 | case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; |
24444 | case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; |
24445 | case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; |
24446 | case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; |
24447 | case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; |
24448 | case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; |
24449 | case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; |
24450 | case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: |
24451 | return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; |
24452 | case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: |
24453 | return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; |
24454 | case X86ISD::LADD: return "X86ISD::LADD"; |
24455 | case X86ISD::LSUB: return "X86ISD::LSUB"; |
24456 | case X86ISD::LOR: return "X86ISD::LOR"; |
24457 | case X86ISD::LXOR: return "X86ISD::LXOR"; |
24458 | case X86ISD::LAND: return "X86ISD::LAND"; |
24459 | case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; |
24460 | case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; |
24461 | case X86ISD::VZEXT: return "X86ISD::VZEXT"; |
24462 | case X86ISD::VSEXT: return "X86ISD::VSEXT"; |
24463 | case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; |
24464 | case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; |
24465 | case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; |
24466 | case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; |
24467 | case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; |
24468 | case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; |
24469 | case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; |
24470 | case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; |
24471 | case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; |
24472 | case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; |
24473 | case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; |
24474 | case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; |
24475 | case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; |
24476 | case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; |
24477 | case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; |
24478 | case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; |
24479 | case X86ISD::VSHL: return "X86ISD::VSHL"; |
24480 | case X86ISD::VSRL: return "X86ISD::VSRL"; |
24481 | case X86ISD::VSRA: return "X86ISD::VSRA"; |
24482 | case X86ISD::VSHLI: return "X86ISD::VSHLI"; |
24483 | case X86ISD::VSRLI: return "X86ISD::VSRLI"; |
24484 | case X86ISD::VSRAI: return "X86ISD::VSRAI"; |
24485 | case X86ISD::VSRAV: return "X86ISD::VSRAV"; |
24486 | case X86ISD::VROTLI: return "X86ISD::VROTLI"; |
24487 | case X86ISD::VROTRI: return "X86ISD::VROTRI"; |
24488 | case X86ISD::VPPERM: return "X86ISD::VPPERM"; |
24489 | case X86ISD::CMPP: return "X86ISD::CMPP"; |
24490 | case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; |
24491 | case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; |
24492 | case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; |
24493 | case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; |
24494 | case X86ISD::ADD: return "X86ISD::ADD"; |
24495 | case X86ISD::SUB: return "X86ISD::SUB"; |
24496 | case X86ISD::ADC: return "X86ISD::ADC"; |
24497 | case X86ISD::SBB: return "X86ISD::SBB"; |
24498 | case X86ISD::SMUL: return "X86ISD::SMUL"; |
24499 | case X86ISD::UMUL: return "X86ISD::UMUL"; |
24500 | case X86ISD::SMUL8: return "X86ISD::SMUL8"; |
24501 | case X86ISD::UMUL8: return "X86ISD::UMUL8"; |
24502 | case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; |
24503 | case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; |
24504 | case X86ISD::INC: return "X86ISD::INC"; |
24505 | case X86ISD::DEC: return "X86ISD::DEC"; |
24506 | case X86ISD::OR: return "X86ISD::OR"; |
24507 | case X86ISD::XOR: return "X86ISD::XOR"; |
24508 | case X86ISD::AND: return "X86ISD::AND"; |
24509 | case X86ISD::BEXTR: return "X86ISD::BEXTR"; |
24510 | case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; |
24511 | case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; |
24512 | case X86ISD::PTEST: return "X86ISD::PTEST"; |
24513 | case X86ISD::TESTP: return "X86ISD::TESTP"; |
24514 | case X86ISD::TESTM: return "X86ISD::TESTM"; |
24515 | case X86ISD::TESTNM: return "X86ISD::TESTNM"; |
24516 | case X86ISD::KORTEST: return "X86ISD::KORTEST"; |
24517 | case X86ISD::KTEST: return "X86ISD::KTEST"; |
24518 | case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; |
24519 | case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; |
24520 | case X86ISD::PACKSS: return "X86ISD::PACKSS"; |
24521 | case X86ISD::PACKUS: return "X86ISD::PACKUS"; |
24522 | case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; |
24523 | case X86ISD::VALIGN: return "X86ISD::VALIGN"; |
24524 | case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; |
24525 | case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; |
24526 | case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; |
24527 | case X86ISD::SHUFP: return "X86ISD::SHUFP"; |
24528 | case X86ISD::SHUF128: return "X86ISD::SHUF128"; |
24529 | case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; |
24530 | case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; |
24531 | case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; |
24532 | case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; |
24533 | case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; |
24534 | case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; |
24535 | case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; |
24536 | case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; |
24537 | case X86ISD::MOVSD: return "X86ISD::MOVSD"; |
24538 | case X86ISD::MOVSS: return "X86ISD::MOVSS"; |
24539 | case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; |
24540 | case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; |
24541 | case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; |
24542 | case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; |
24543 | case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; |
24544 | case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; |
24545 | case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; |
24546 | case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; |
24547 | case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; |
24548 | case X86ISD::VPERMV: return "X86ISD::VPERMV"; |
24549 | case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; |
24550 | case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; |
24551 | case X86ISD::VPERMI: return "X86ISD::VPERMI"; |
24552 | case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; |
24553 | case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; |
24554 | case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; |
24555 | case X86ISD::VRANGE: return "X86ISD::VRANGE"; |
24556 | case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; |
24557 | case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; |
24558 | case X86ISD::PSADBW: return "X86ISD::PSADBW"; |
24559 | case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; |
24560 | case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; |
24561 | case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; |
24562 | case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; |
24563 | case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; |
24564 | case X86ISD::MFENCE: return "X86ISD::MFENCE"; |
24565 | case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; |
24566 | case X86ISD::SAHF: return "X86ISD::SAHF"; |
24567 | case X86ISD::RDRAND: return "X86ISD::RDRAND"; |
24568 | case X86ISD::RDSEED: return "X86ISD::RDSEED"; |
24569 | case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; |
24570 | case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; |
24571 | case X86ISD::VPROT: return "X86ISD::VPROT"; |
24572 | case X86ISD::VPROTI: return "X86ISD::VPROTI"; |
24573 | case X86ISD::VPSHA: return "X86ISD::VPSHA"; |
24574 | case X86ISD::VPSHL: return "X86ISD::VPSHL"; |
24575 | case X86ISD::VPCOM: return "X86ISD::VPCOM"; |
24576 | case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; |
24577 | case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; |
24578 | case X86ISD::FMADD: return "X86ISD::FMADD"; |
24579 | case X86ISD::FMSUB: return "X86ISD::FMSUB"; |
24580 | case X86ISD::FNMADD: return "X86ISD::FNMADD"; |
24581 | case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; |
24582 | case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; |
24583 | case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; |
24584 | case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; |
24585 | case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; |
24586 | case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; |
24587 | case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; |
24588 | case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; |
24589 | case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; |
24590 | case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; |
24591 | case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; |
24592 | case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; |
24593 | case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; |
24594 | case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; |
24595 | case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; |
24596 | case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; |
24597 | case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; |
24598 | case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; |
24599 | case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; |
24600 | case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; |
24601 | case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; |
24602 | case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; |
24603 | case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; |
24604 | case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; |
24605 | case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; |
24606 | case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; |
24607 | case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; |
24608 | case X86ISD::XTEST: return "X86ISD::XTEST"; |
24609 | case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; |
24610 | case X86ISD::EXPAND: return "X86ISD::EXPAND"; |
24611 | case X86ISD::SELECT: return "X86ISD::SELECT"; |
24612 | case X86ISD::SELECTS: return "X86ISD::SELECTS"; |
24613 | case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; |
24614 | case X86ISD::RCP28: return "X86ISD::RCP28"; |
24615 | case X86ISD::RCP28S: return "X86ISD::RCP28S"; |
24616 | case X86ISD::EXP2: return "X86ISD::EXP2"; |
24617 | case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; |
24618 | case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; |
24619 | case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; |
24620 | case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; |
24621 | case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; |
24622 | case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; |
24623 | case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; |
24624 | case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; |
24625 | case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; |
24626 | case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; |
24627 | case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; |
24628 | case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; |
24629 | case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; |
24630 | case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; |
24631 | case X86ISD::SCALEF: return "X86ISD::SCALEF"; |
24632 | case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; |
24633 | case X86ISD::ADDS: return "X86ISD::ADDS"; |
24634 | case X86ISD::SUBS: return "X86ISD::SUBS"; |
24635 | case X86ISD::AVG: return "X86ISD::AVG"; |
24636 | case X86ISD::MULHRS: return "X86ISD::MULHRS"; |
24637 | case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; |
24638 | case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; |
24639 | case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; |
24640 | case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; |
24641 | case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; |
24642 | case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; |
24643 | case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; |
24644 | case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; |
24645 | case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; |
24646 | case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; |
24647 | case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; |
24648 | case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; |
24649 | case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; |
24650 | case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; |
24651 | case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; |
24652 | case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; |
24653 | case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; |
24654 | case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; |
24655 | case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; |
24656 | case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; |
24657 | case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; |
24658 | case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; |
24659 | case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; |
24660 | case X86ISD::LWPINS: return "X86ISD::LWPINS"; |
24661 | case X86ISD::MGATHER: return "X86ISD::MGATHER"; |
24662 | } |
24663 | return nullptr; |
24664 | } |
24665 | |
24666 | /// Return true if the addressing mode represented by AM is legal for this |
24667 | /// target, for a load/store of the specified type. |
24668 | bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, |
24669 | const AddrMode &AM, Type *Ty, |
24670 | unsigned AS) const { |
24671 | // X86 supports extremely general addressing modes. |
24672 | CodeModel::Model M = getTargetMachine().getCodeModel(); |
24673 | |
24674 | // X86 allows a sign-extended 32-bit immediate field as a displacement. |
24675 | if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) |
24676 | return false; |
24677 | |
24678 | if (AM.BaseGV) { |
24679 | unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); |
24680 | |
24681 | // If a reference to this global requires an extra load, we can't fold it. |
24682 | if (isGlobalStubReference(GVFlags)) |
24683 | return false; |
24684 | |
24685 | // If BaseGV requires a register for the PIC base, we cannot also have a |
24686 | // BaseReg specified. |
24687 | if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) |
24688 | return false; |
24689 | |
24690 | // If lower 4G is not available, then we must use rip-relative addressing. |
24691 | if ((M != CodeModel::Small || isPositionIndependent()) && |
24692 | Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) |
24693 | return false; |
24694 | } |
24695 | |
24696 | switch (AM.Scale) { |
24697 | case 0: |
24698 | case 1: |
24699 | case 2: |
24700 | case 4: |
24701 | case 8: |
24702 | // These scales always work. |
24703 | break; |
24704 | case 3: |
24705 | case 5: |
24706 | case 9: |
24707 | // These scales are formed with basereg+scalereg. Only accept if there is |
24708 | // no basereg yet. |
24709 | if (AM.HasBaseReg) |
24710 | return false; |
24711 | break; |
24712 | default: // Other stuff never works. |
24713 | return false; |
24714 | } |
24715 | |
24716 | return true; |
24717 | } |
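      | // Illustrative sketch (not part of the lowering itself): the accepted
      | // scales map onto the hardware's [base + scale*index + disp] forms, e.g.
      | //   scale 2: leal (%rax,%rbx,2), %ecx   ; base + 2*index
      | //   scale 3: leal (%rax,%rax,2), %ecx   ; index + 2*index, which occupies
      | //            the base-register slot, hence the AM.HasBaseReg rejection.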
24718 | |
24719 | bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { |
24720 | unsigned Bits = Ty->getScalarSizeInBits(); |
24721 | |
24722 | // 8-bit shifts are always expensive, and versions with a scalar amount
24723 | // aren't particularly cheaper than versions with a vector amount.
24724 | if (Bits == 8) |
24725 | return false; |
24726 | |
24727 | // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make |
24728 | // variable shifts just as cheap as scalar ones. |
24729 | if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64)) |
24730 | return false; |
24731 | |
24732 | // Otherwise, it's significantly cheaper to shift by a scalar amount than by a |
24733 | // fully general vector. |
24734 | return true; |
24735 | } |
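      | // For example (sketch): with AVX2, `vpsllvd %xmm1, %xmm0, %xmm0` shifts
      | // each i32 lane by its own amount at roughly the cost of the uniform
      | // `vpslld %xmm1, %xmm0, %xmm0`, which is why this hook returns false for
      | // 32/64-bit elements when the subtarget has Int256.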
24736 | |
24737 | bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { |
24738 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
24739 | return false; |
24740 | unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); |
24741 | unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); |
24742 | return NumBits1 > NumBits2; |
24743 | } |
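      | // Illustration: truncating i64 to i32 needs no instruction at all; the
      | // value is simply used through the 32-bit subregister (%eax for %rax).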
24744 | |
24745 | bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { |
24746 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
24747 | return false; |
24748 | |
24749 | if (!isTypeLegal(EVT::getEVT(Ty1))) |
24750 | return false; |
24751 | |
24752 | assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24753 | |
24754 | // Assuming the caller doesn't have a zeroext or signext return parameter, |
24755 | // truncation all the way down to i1 is valid. |
24756 | return true; |
24757 | } |
24758 | |
24759 | bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { |
24760 | return isInt<32>(Imm); |
24761 | } |
24762 | |
24763 | bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { |
24764 | // Can also use sub to handle negated immediates. |
24765 | return isInt<32>(Imm); |
24766 | } |
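      | // Illustration: `addq $0x7fffffff, %rax` encodes the immediate directly
      | // as a sign-extended 32-bit field, whereas adding 0x100000000 would first
      | // need a `movabsq` of the constant into a scratch register.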
24767 | |
24768 | bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { |
24769 | if (!VT1.isInteger() || !VT2.isInteger()) |
24770 | return false; |
24771 | unsigned NumBits1 = VT1.getSizeInBits(); |
24772 | unsigned NumBits2 = VT2.getSizeInBits(); |
24773 | return NumBits1 > NumBits2; |
24774 | } |
24775 | |
24776 | bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { |
24777 | // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. |
24778 | return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); |
24779 | } |
24780 | |
24781 | bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { |
24782 | // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. |
24783 | return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); |
24784 | } |
24785 | |
24786 | bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { |
24787 | EVT VT1 = Val.getValueType(); |
24788 | if (isZExtFree(VT1, VT2)) |
24789 | return true; |
24790 | |
24791 | if (Val.getOpcode() != ISD::LOAD) |
24792 | return false; |
24793 | |
24794 | if (!VT1.isSimple() || !VT1.isInteger() || |
24795 | !VT2.isSimple() || !VT2.isInteger()) |
24796 | return false; |
24797 | |
24798 | switch (VT1.getSimpleVT().SimpleTy) { |
24799 | default: break; |
24800 | case MVT::i8: |
24801 | case MVT::i16: |
24802 | case MVT::i32: |
24803 | // X86 has 8, 16, and 32-bit zero-extending loads. |
24804 | return true; |
24805 | } |
24806 | |
24807 | return false; |
24808 | } |
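      | // Illustration: `movzbl (%rdi), %eax` and `movzwl (%rdi), %eax` cost the
      | // same as the plain narrow loads, and a 32-bit load such as
      | // `movl (%rdi), %eax` implicitly zeroes bits 63:32 of %rax, so the zero
      | // extension folds into the load for free.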
24809 | |
24810 | bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } |
24811 | |
24812 | bool |
24813 | X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { |
24814 | if (!Subtarget.hasAnyFMA()) |
24815 | return false; |
24816 | |
24817 | VT = VT.getScalarType(); |
24818 | |
24819 | if (!VT.isSimple()) |
24820 | return false; |
24821 | |
24822 | switch (VT.getSimpleVT().SimpleTy) { |
24823 | case MVT::f32: |
24824 | case MVT::f64: |
24825 | return true; |
24826 | default: |
24827 | break; |
24828 | } |
24829 | |
24830 | return false; |
24831 | } |
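      | // Rough illustration (exact latencies vary by microarchitecture): on a
      | // Haswell-class core `vfmadd213ss` takes about 5 cycles, while a separate
      | // `mulss` (about 5) followed by `addss` (about 3) takes about 8 and costs
      | // an extra uop.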
24832 | |
24833 | bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { |
24834 | // i16 instructions are longer (0x66 prefix) and potentially slower. |
24835 | return !(VT1 == MVT::i32 && VT2 == MVT::i16); |
24836 | } |
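      | // e.g. `addw $1000, %ax` needs a 0x66 operand-size prefix, and the 16-bit
      | // immediate after it is a length-changing-prefix case that stalls the
      | // decoders on many Intel cores; `addl $1000, %eax` has neither problem.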
24837 | |
24838 | /// Targets can use this to indicate that they only support *some* |
24839 | /// VECTOR_SHUFFLE operations, those with specific masks. |
24840 | /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values |
24841 | /// are assumed to be legal. |
24842 | bool |
24843 | X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, |
24844 | EVT VT) const { |
24845 | if (!VT.isSimple()) |
24846 | return false; |
24847 | |
24848 | // Not for i1 vectors |
24849 | if (VT.getSimpleVT().getScalarType() == MVT::i1) |
24850 | return false; |
24851 | |
24852 | // Very little shuffling can be done for 64-bit vectors right now. |
24853 | if (VT.getSimpleVT().getSizeInBits() == 64) |
24854 | return false; |
24855 | |
24856 | // We only care that the types being shuffled are legal. The lowering can |
24857 | // handle any possible shuffle mask that results. |
24858 | return isTypeLegal(VT.getSimpleVT()); |
24859 | } |
24860 | |
24861 | bool |
24862 | X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, |
24863 | EVT VT) const { |
24864 | // Just delegate to the generic legality, clear masks aren't special. |
24865 | return isShuffleMaskLegal(Mask, VT); |
24866 | } |
24867 | |
24868 | //===----------------------------------------------------------------------===// |
24869 | // X86 Scheduler Hooks |
24870 | //===----------------------------------------------------------------------===// |
24871 | |
24872 | /// Utility function to emit xbegin specifying the start of an RTM region. |
24873 | static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, |
24874 | const TargetInstrInfo *TII) { |
24875 | DebugLoc DL = MI.getDebugLoc(); |
24876 | |
24877 | const BasicBlock *BB = MBB->getBasicBlock(); |
24878 | MachineFunction::iterator I = ++MBB->getIterator(); |
24879 | |
24880 | // For the v = xbegin(), we generate |
24881 | // |
24882 | // thisMBB: |
24883 | // xbegin sinkMBB |
24884 | // |
24885 | // mainMBB: |
24886 | // s0 = -1 |
24887 | // |
24888 | // fallMBB:
24889 | // eax = # XABORT_DEF |
24890 | // s1 = eax |
24891 | // |
24892 | // sinkMBB: |
24893 | // v = phi(s0/mainMBB, s1/fallMBB)
24894 | |
24895 | MachineBasicBlock *thisMBB = MBB; |
24896 | MachineFunction *MF = MBB->getParent(); |
24897 | MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); |
24898 | MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); |
24899 | MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); |
24900 | MF->insert(I, mainMBB); |
24901 | MF->insert(I, fallMBB); |
24902 | MF->insert(I, sinkMBB); |
24903 | |
24904 | // Transfer the remainder of BB and its successor edges to sinkMBB. |
24905 | sinkMBB->splice(sinkMBB->begin(), MBB, |
24906 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); |
24907 | sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); |
24908 | |
24909 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
24910 | unsigned DstReg = MI.getOperand(0).getReg(); |
24911 | const TargetRegisterClass *RC = MRI.getRegClass(DstReg); |
24912 | unsigned mainDstReg = MRI.createVirtualRegister(RC); |
24913 | unsigned fallDstReg = MRI.createVirtualRegister(RC); |
24914 | |
24915 | // thisMBB: |
24916 | // xbegin fallMBB |
24917 | // # fallthrough to mainMBB |
24918 | // # on abort, branch to fallMBB
24919 | BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); |
24920 | thisMBB->addSuccessor(mainMBB); |
24921 | thisMBB->addSuccessor(fallMBB); |
24922 | |
24923 | // mainMBB: |
24924 | // mainDstReg := -1 |
24925 | BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); |
24926 | BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); |
24927 | mainMBB->addSuccessor(sinkMBB); |
24928 | |
24929 | // fallMBB: |
24930 | // ; pseudo instruction to model hardware's definition from XABORT |
24931 | // EAX := XABORT_DEF |
24932 | // fallDstReg := EAX |
24933 | BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); |
24934 | BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) |
24935 | .addReg(X86::EAX); |
24936 | fallMBB->addSuccessor(sinkMBB); |
24937 | |
24938 | // sinkMBB: |
24939 | // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) |
24940 | BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) |
24941 | .addReg(mainDstReg).addMBB(mainMBB) |
24942 | .addReg(fallDstReg).addMBB(fallMBB); |
24943 | |
24944 | MI.eraseFromParent(); |
24945 | return sinkMBB; |
24946 | } |
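      | // Sketch of the emitted code for `v = xbegin()` (labels and virtual
      | // registers invented for illustration):
      | //   xbegin  .Lfall         # start the RTM transaction
      | //   movl    $-1, %v1       # success path: -1 is _XBEGIN_STARTED
      | //   jmp     .Lsink
      | // .Lfall:                  # abort path: EAX holds the abort status
      | //   movl    %eax, %v2
      | // .Lsink:
      | //   v = phi(%v1, %v2)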
24947 | |
24948 | // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
24949 | // or XMM0_V32I8 in AVX, all of this code can be replaced with
24950 | // patterns in the .td file.
24951 | static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, |
24952 | const TargetInstrInfo *TII) { |
24953 | unsigned Opc; |
24954 | switch (MI.getOpcode()) { |
24955 | default: llvm_unreachable("illegal opcode!");
24956 | case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; |
24957 | case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; |
24958 | case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; |
24959 | case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; |
24960 | case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; |
24961 | case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; |
24962 | case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; |
24963 | case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; |
24964 | } |
24965 | |
24966 | DebugLoc dl = MI.getDebugLoc(); |
24967 | MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); |
24968 | |
24969 | unsigned NumArgs = MI.getNumOperands(); |
24970 | for (unsigned i = 1; i < NumArgs; ++i) { |
24971 | MachineOperand &Op = MI.getOperand(i); |
24972 | if (!(Op.isReg() && Op.isImplicit())) |
24973 | MIB.add(Op); |
24974 | } |
24975 | if (MI.hasOneMemOperand()) |
24976 | MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); |
24977 | |
24978 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) |
24979 | .addReg(X86::XMM0); |
24980 | |
24981 | MI.eraseFromParent(); |
24982 | return BB; |
24983 | } |
24984 | |
24985 | // FIXME: Custom handling because TableGen doesn't support multiple implicit |
24986 | // defs in an instruction pattern |
24987 | static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, |
24988 | const TargetInstrInfo *TII) { |
24989 | unsigned Opc; |
24990 | switch (MI.getOpcode()) { |
24991 | default: llvm_unreachable("illegal opcode!");
24992 | case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; |
24993 | case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; |
24994 | case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; |
24995 | case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; |
24996 | case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; |
24997 | case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; |
24998 | case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; |
24999 | case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; |
25000 | } |
25001 | |
25002 | DebugLoc dl = MI.getDebugLoc(); |
25003 | MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); |
25004 | |
25005 | unsigned NumArgs = MI.getNumOperands(); // operand 0 is the result; the loop below skips it
25006 | for (unsigned i = 1; i < NumArgs; ++i) { |
25007 | MachineOperand &Op = MI.getOperand(i); |
25008 | if (!(Op.isReg() && Op.isImplicit())) |
25009 | MIB.add(Op); |
25010 | } |
25011 | if (MI.hasOneMemOperand()) |
25012 | MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); |
25013 | |
25014 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) |
25015 | .addReg(X86::ECX); |
25016 | |
25017 | MI.eraseFromParent(); |
25018 | return BB; |
25019 | } |
25020 | |
25021 | static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, |
25022 | const X86Subtarget &Subtarget) { |
25023 | DebugLoc dl = MI.getDebugLoc(); |
25024 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25025 | |
25026 | // insert input VAL into EAX |
25027 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) |
25028 | .addReg(MI.getOperand(0).getReg()); |
25029 | // insert zero to ECX |
25030 | BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); |
25031 | |
25032 | // insert zero to EDX |
25033 | BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); |
25034 | |
25035 | // insert WRPKRU instruction |
25036 | BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); |
25037 | |
25038 | MI.eraseFromParent(); // The pseudo is gone now. |
25039 | return BB; |
25040 | } |
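      | // For reference, the WRPKRU pseudo expands to roughly:
      | //   movl %<val>, %eax
      | //   xorl %ecx, %ecx        # ECX must be zero
      | //   xorl %edx, %edx        # EDX must be zero
      | //   wrpkru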
25041 | |
25042 | static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, |
25043 | const X86Subtarget &Subtarget) { |
25044 | DebugLoc dl = MI.getDebugLoc(); |
25045 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25046 | |
25047 | // insert zero to ECX |
25048 | BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); |
25049 | |
25050 | // insert RDPKRU instruction |
25051 | BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); |
25052 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) |
25053 | .addReg(X86::EAX); |
25054 | |
25055 | MI.eraseFromParent(); // The pseudo is gone now. |
25056 | return BB; |
25057 | } |
25058 | |
25059 | static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, |
25060 | const X86Subtarget &Subtarget, |
25061 | unsigned Opc) { |
25062 | DebugLoc dl = MI.getDebugLoc(); |
25063 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25064 | // Address into RAX/EAX, other two args into ECX, EDX. |
25065 | unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; |
25066 | unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; |
25067 | MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); |
25068 | for (int i = 0; i < X86::AddrNumOperands; ++i) |
25069 | MIB.add(MI.getOperand(i)); |
25070 | |
25071 | unsigned ValOps = X86::AddrNumOperands; |
25072 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) |
25073 | .addReg(MI.getOperand(ValOps).getReg()); |
25074 | BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) |
25075 | .addReg(MI.getOperand(ValOps + 1).getReg()); |
25076 | |
25077 | // The instruction itself takes no explicit operands; it implicitly reads
25077 | // the registers set up above.
25078 | BuildMI(*BB, MI, dl, TII->get(Opc)); |
25079 | |
25080 | MI.eraseFromParent(); // The pseudo is gone now. |
25081 | return BB; |
25082 | } |
25083 | |
25084 | static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, |
25085 | const X86Subtarget &Subtarget) { |
25086 | DebugLoc dl = MI->getDebugLoc(); |
25087 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25088 | // Address into RAX/EAX |
25089 | unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; |
25090 | unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; |
25091 | MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); |
25092 | for (int i = 0; i < X86::AddrNumOperands; ++i) |
25093 | MIB.add(MI->getOperand(i)); |
25094 | |
25095 | // The instruction itself takes no explicit operands; it implicitly reads
25095 | // the address in RAX/EAX.
25096 | BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); |
25097 | |
25098 | MI->eraseFromParent(); // The pseudo is gone now. |
25099 | return BB; |
25100 | } |
25101 | |
25102 | |
25103 | |
25104 | MachineBasicBlock * |
25105 | X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, |
25106 | MachineBasicBlock *MBB) const { |
25107 | // Emit va_arg instruction on X86-64. |
25108 | |
25109 | // Operands to this pseudo-instruction: |
25110 | // 0 ) Output : destination address (reg) |
25111 | // 1-5) Input : va_list address (addr, i64mem) |
25112 | // 6 ) ArgSize : Size (in bytes) of vararg type |
25113 | // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset |
25114 | // 8 ) Align : Alignment of type |
25115 | // 9 ) EFLAGS (implicit-def) |
25116 | |
25117 | assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25118 | static_assert(X86::AddrNumOperands == 5, |
25119 | "VAARG_64 assumes 5 address operands"); |
25120 | |
25121 | unsigned DestReg = MI.getOperand(0).getReg(); |
25122 | MachineOperand &Base = MI.getOperand(1); |
25123 | MachineOperand &Scale = MI.getOperand(2); |
25124 | MachineOperand &Index = MI.getOperand(3); |
25125 | MachineOperand &Disp = MI.getOperand(4); |
25126 | MachineOperand &Segment = MI.getOperand(5); |
25127 | unsigned ArgSize = MI.getOperand(6).getImm(); |
25128 | unsigned ArgMode = MI.getOperand(7).getImm(); |
25129 | unsigned Align = MI.getOperand(8).getImm(); |
25130 | |
25131 | // Memory Reference |
25132 | assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25133 | MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); |
25134 | MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); |
25135 | |
25136 | // Machine Information |
25137 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25138 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
25139 | const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); |
25140 | const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); |
25141 | DebugLoc DL = MI.getDebugLoc(); |
25142 | |
25143 | // struct va_list { |
25144 | // i32 gp_offset |
25145 | // i32 fp_offset |
25146 | // i64 overflow_area (address) |
25147 | // i64 reg_save_area (address) |
25148 | // } |
25149 | // sizeof(va_list) = 24 |
25150 | // alignment(va_list) = 8 |
25151 | |
25152 | unsigned TotalNumIntRegs = 6; |
25153 | unsigned TotalNumXMMRegs = 8; |
25154 | bool UseGPOffset = (ArgMode == 1); |
25155 | bool UseFPOffset = (ArgMode == 2); |
25156 | unsigned MaxOffset = TotalNumIntRegs * 8 + |
25157 | (UseFPOffset ? TotalNumXMMRegs * 16 : 0); |
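      | // Worked example: with fp_offset in use, MaxOffset = 6*8 + 8*16 = 176,
      | // the full size of the SysV x86-64 register save area (48 bytes of GPRs
      | // followed by 128 bytes of XMM slots).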
25158 | |
25159 | // Align ArgSize to a multiple of 8.
25160 | unsigned ArgSizeA8 = (ArgSize + 7) & ~7; |
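      | // e.g. ArgSize == 12 rounds up to ArgSizeA8 == (12 + 7) & ~7 == 16.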
25161 | bool NeedsAlign = (Align > 8); |
25162 | |
25163 | MachineBasicBlock *thisMBB = MBB; |
25164 | MachineBasicBlock *overflowMBB; |
25165 | MachineBasicBlock *offsetMBB; |
25166 | MachineBasicBlock *endMBB; |
25167 | |
25168 | unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB |
25169 | unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB |
25170 | unsigned OffsetReg = 0; |
25171 | |
25172 | if (!UseGPOffset && !UseFPOffset) { |
25173 | // If we only pull from the overflow region, we don't create a branch;
25174 | // control flow is unchanged.
25175 | OffsetDestReg = 0; // unused |
25176 | OverflowDestReg = DestReg; |
25177 | |
25178 | offsetMBB = nullptr; |
25179 | overflowMBB = thisMBB; |
25180 | endMBB = thisMBB; |
25181 | } else { |
25182 | // First emit code to check if gp_offset (or fp_offset) is below the bound. |
25183 | // If so, pull the argument from reg_save_area. (branch to offsetMBB) |
25184 | // If not, pull from overflow_area. (branch to overflowMBB) |
25185 | // |
25186 | // thisMBB |
25187 | // | . |
25188 | // | . |
25189 | // offsetMBB overflowMBB |
25190 | // | . |
25191 | // | . |
25192 | // endMBB |
25193 | |
25194 | // Registers for the PHI in endMBB |
25195 | OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); |
25196 | OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); |
25197 | |
25198 | const BasicBlock *LLVM_BB = MBB->getBasicBlock(); |
25199 | MachineFunction *MF = MBB->getParent(); |
25200 | overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25201 | offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25202 | endMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25203 | |
25204 | MachineFunction::iterator MBBIter = ++MBB->getIterator(); |
25205 | |
25206 | // Insert the new basic blocks |
25207 | MF->insert(MBBIter, offsetMBB); |
25208 | MF->insert(MBBIter, overflowMBB); |
25209 | MF->insert(MBBIter, endMBB); |
25210 | |
25211 | // Transfer the remainder of MBB and its successor edges to endMBB. |
25212 | endMBB->splice(endMBB->begin(), thisMBB, |
25213 | std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); |
25214 | endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); |
25215 | |
25216 | // Make offsetMBB and overflowMBB successors of thisMBB |
25217 | thisMBB->addSuccessor(offsetMBB); |
25218 | thisMBB->addSuccessor(overflowMBB); |
25219 | |
25220 | // endMBB is a successor of both offsetMBB and overflowMBB |
25221 | offsetMBB->addSuccessor(endMBB); |
25222 | overflowMBB->addSuccessor(endMBB); |
25223 | |
25224 | // Load the offset value into a register |
25225 | OffsetReg = MRI.createVirtualRegister(OffsetRegClass); |
25226 | BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) |
25227 | .add(Base) |
25228 | .add(Scale) |
25229 | .add(Index) |
25230 | .addDisp(Disp, UseFPOffset ? 4 : 0) |
25231 | .add(Segment) |
25232 | .setMemRefs(MMOBegin, MMOEnd); |
25233 | |
25234 | // Check if there is enough room left to pull this argument. |
25235 | BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) |
25236 | .addReg(OffsetReg) |
25237 | .addImm(MaxOffset + 8 - ArgSizeA8); |
25238 | |
25239 | // Branch to "overflowMBB" if offset >= max |
25240 | // Fall through to "offsetMBB" otherwise |
25241 | BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) |
25242 | .addMBB(overflowMBB); |
25243 | } |
25244 | |
25245 | // In offsetMBB, emit code to use the reg_save_area. |
25246 | if (offsetMBB) { |
25247 | assert(OffsetReg != 0);
25248 | |
25249 | // Read the reg_save_area address. |
25250 | unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); |
25251 | BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) |
25252 | .add(Base) |
25253 | .add(Scale) |
25254 | .add(Index) |
25255 | .addDisp(Disp, 16) |
25256 | .add(Segment) |
25257 | .setMemRefs(MMOBegin, MMOEnd); |
25258 | |
25259 | // Zero-extend the offset |
25260 | unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); |
25261 | BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) |
25262 | .addImm(0) |
25263 | .addReg(OffsetReg) |
25264 | .addImm(X86::sub_32bit); |
25265 | |
25266 | // Add the offset to the reg_save_area to get the final address. |
25267 | BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) |
25268 | .addReg(OffsetReg64) |
25269 | .addReg(RegSaveReg); |
25270 | |
25271 | // Compute the offset for the next argument |
25272 | unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); |
25273 | BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) |
25274 | .addReg(OffsetReg) |
25275 | .addImm(UseFPOffset ? 16 : 8); |
25276 | |
25277 | // Store it back into the va_list. |
25278 | BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) |
25279 | .add(Base) |
25280 | .add(Scale) |
25281 | .add(Index) |
25282 | .addDisp(Disp, UseFPOffset ? 4 : 0) |
25283 | .add(Segment) |
25284 | .addReg(NextOffsetReg) |
25285 | .setMemRefs(MMOBegin, MMOEnd); |
25286 | |
25287 | // Jump to endMBB |
25288 | BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) |
25289 | .addMBB(endMBB); |
25290 | } |
25291 | |
25292 | // |
25293 | // Emit code to use overflow area |
25294 | // |
25295 | |
25296 | // Load the overflow_area address into a register. |
25297 | unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); |
25298 | BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) |
25299 | .add(Base) |
25300 | .add(Scale) |
25301 | .add(Index) |
25302 | .addDisp(Disp, 8) |
25303 | .add(Segment) |
25304 | .setMemRefs(MMOBegin, MMOEnd); |
25305 | |
25306 | // If we need to align it, do so. Otherwise, just copy the address |
25307 | // to OverflowDestReg. |
25308 | if (NeedsAlign) { |
25309 | // Align the overflow address |
25310 | assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25311 | unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); |
25312 | |
25313 | // aligned_addr = (addr + (align-1)) & ~(align-1) |
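      | // e.g. addr == 0x1004, align == 16: (0x1004 + 15) & ~15 == 0x1010.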
25314 | BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) |
25315 | .addReg(OverflowAddrReg) |
25316 | .addImm(Align-1); |
25317 | |
25318 | BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) |
25319 | .addReg(TmpReg) |
25320 | .addImm(~(uint64_t)(Align-1)); |
25321 | } else { |
25322 | BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) |
25323 | .addReg(OverflowAddrReg); |
25324 | } |
25325 | |
25326 | // Compute the next overflow address after this argument. |
25327 | // (the overflow address should be kept 8-byte aligned) |
25328 | unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); |
25329 | BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) |
25330 | .addReg(OverflowDestReg) |
25331 | .addImm(ArgSizeA8); |
25332 | |
25333 | // Store the new overflow address. |
25334 | BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) |
25335 | .add(Base) |
25336 | .add(Scale) |
25337 | .add(Index) |
25338 | .addDisp(Disp, 8) |
25339 | .add(Segment) |
25340 | .addReg(NextAddrReg) |
25341 | .setMemRefs(MMOBegin, MMOEnd); |
25342 | |
25343 | // If we branched, emit the PHI to the front of endMBB. |
25344 | if (offsetMBB) { |
25345 | BuildMI(*endMBB, endMBB->begin(), DL, |
25346 | TII->get(X86::PHI), DestReg) |
25347 | .addReg(OffsetDestReg).addMBB(offsetMBB) |
25348 | .addReg(OverflowDestReg).addMBB(overflowMBB); |
25349 | } |
25350 | |
25351 | // Erase the pseudo instruction |
25352 | MI.eraseFromParent(); |
25353 | |
25354 | return endMBB; |
25355 | } |
25356 | |
25357 | MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( |
25358 | MachineInstr &MI, MachineBasicBlock *MBB) const { |
25359 | // Emit code to save XMM registers to the stack. The ABI says that the
25360 | // number of registers to save is given in %al, so it's theoretically
25361 | // possible to do an indirect jump trick to avoid saving all of them.
25362 | // However, this code takes a simpler approach and just executes all
25363 | // of the stores if %al is non-zero. It's less code, it's probably
25364 | // easier on the hardware branch predictor, and stores aren't all that
25365 | // expensive anyway.
25366 | |
25367 | // Create the new basic blocks. One block contains all the XMM stores, |
25368 | // and one block is the final destination regardless of whether any |
25369 | // stores were performed. |
25370 | const BasicBlock *LLVM_BB = MBB->getBasicBlock(); |
25371 | MachineFunction *F = MBB->getParent(); |
25372 | MachineFunction::iterator MBBIter = ++MBB->getIterator(); |
25373 | MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); |
25374 | MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); |
25375 | F->insert(MBBIter, XMMSaveMBB); |
25376 | F->insert(MBBIter, EndMBB); |
25377 | |
25378 | // Transfer the remainder of MBB and its successor edges to EndMBB. |
25379 | EndMBB->splice(EndMBB->begin(), MBB, |
25380 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); |
25381 | EndMBB->transferSuccessorsAndUpdatePHIs(MBB); |
25382 | |
25383 | // The original block will now fall through to the XMM save block. |
25384 | MBB->addSuccessor(XMMSaveMBB); |
25385 | // The XMMSaveMBB will fall through to the end block. |
25386 | XMMSaveMBB->addSuccessor(EndMBB); |
25387 | |
25388 | // Now add the instructions. |
25389 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25390 | DebugLoc DL = MI.getDebugLoc(); |
25391 | |
25392 | unsigned CountReg = MI.getOperand(0).getReg(); |
25393 | int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); |
25394 | int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); |
25395 | |
25396 | if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) { |
25397 | // If %al is 0, branch around the XMM save block. |
25398 | BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); |
25399 | BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); |
25400 | MBB->addSuccessor(EndMBB); |
25401 | } |
25402 | |
25403 | // Make sure the last operand is EFLAGS, which gets clobbered by the branch |
25404 | // that was just emitted, but clearly shouldn't be "saved". |
25405 | assert((MI.getNumOperands() <= 3 ||
25406 | !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25407 | MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25408 | "Expected last argument to be EFLAGS");
25409 | unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; |
25410 | // In the XMM save block, save all the XMM argument registers. |
25411 | for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { |
25412 | int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; |
25413 | MachineMemOperand *MMO = F->getMachineMemOperand( |
25414 | MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), |
25415 | MachineMemOperand::MOStore, |
25416 | /*Size=*/16, /*Align=*/16); |
25417 | BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) |
25418 | .addFrameIndex(RegSaveFrameIndex) |
25419 | .addImm(/*Scale=*/1) |
25420 | .addReg(/*IndexReg=*/0) |
25421 | .addImm(/*Disp=*/Offset) |
25422 | .addReg(/*Segment=*/0) |
25423 | .addReg(MI.getOperand(i).getReg()) |
25424 | .addMemOperand(MMO); |
25425 | } |
25426 | |
25427 | MI.eraseFromParent(); // The pseudo instruction is gone now. |
25428 | |
25429 | return EndMBB; |
25430 | } |
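      | // Sketch of the non-Win64 output of the function above (label invented
      | // for illustration):
      | //   testb %al, %al
      | //   je    .Lend
      | //   movaps %xmm0, <save area + 0>  ...  movaps %xmm7, <save area + 112>
      | // .Lend: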
25431 | |
25432 | // The EFLAGS operand of SelectItr might be missing a kill marker |
25433 | // because there were multiple uses of EFLAGS, and ISel didn't know |
25434 | // which to mark. Figure out whether SelectItr should have had a |
25435 | // kill marker, and set it if it should. Returns the correct kill |
25436 | // marker value. |
25437 | static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, |
25438 | MachineBasicBlock* BB, |
25439 | const TargetRegisterInfo* TRI) { |
25440 | // Scan forward through BB for a use/def of EFLAGS. |
25441 | MachineBasicBlock::iterator miI(std::next(SelectItr)); |
25442 | for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { |
25443 | const MachineInstr& mi = *miI; |
25444 | if (mi.readsRegister(X86::EFLAGS)) |
25445 | return false; |
25446 | if (mi.definesRegister(X86::EFLAGS)) |
25447 | break; // Should have kill-flag - update below. |
25448 | } |
25449 | |
25450 | // If we hit the end of the block, check whether EFLAGS is live into a |
25451 | // successor. |
25452 | if (miI == BB->end()) { |
25453 | for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), |
25454 | sEnd = BB->succ_end(); |
25455 | sItr != sEnd; ++sItr) { |
25456 | MachineBasicBlock* succ = *sItr; |
25457 | if (succ->isLiveIn(X86::EFLAGS)) |
25458 | return false; |
25459 | } |
25460 | } |
25461 | |
25462 | // We found a def, or hit the end of the basic block and EFLAGS wasn't live |
25463 | // out. SelectMI should have a kill flag on EFLAGS. |
25464 | SelectItr->addRegisterKilled(X86::EFLAGS, TRI); |
25465 | return true; |
25466 | } |
25467 | |
25468 | // Return true if it is OK for this CMOV pseudo-opcode to be cascaded |
25469 | // together with other CMOV pseudo-opcodes into a single basic block with
25470 | // a conditional jump around it.
25471 | static bool isCMOVPseudo(MachineInstr &MI) { |
25472 | switch (MI.getOpcode()) { |
25473 | case X86::CMOV_FR32: |
25474 | case X86::CMOV_FR64: |
25475 | case X86::CMOV_GR8: |
25476 | case X86::CMOV_GR16: |
25477 | case X86::CMOV_GR32: |
25478 | case X86::CMOV_RFP32: |
25479 | case X86::CMOV_RFP64: |
25480 | case X86::CMOV_RFP80: |
25481 | case X86::CMOV_V2F64: |
25482 | case X86::CMOV_V2I64: |
25483 | case X86::CMOV_V4F32: |
25484 | case X86::CMOV_V4F64: |
25485 | case X86::CMOV_V4I64: |
25486 | case X86::CMOV_V16F32: |
25487 | case X86::CMOV_V8F32: |
25488 | case X86::CMOV_V8F64: |
25489 | case X86::CMOV_V8I64: |
25490 | case X86::CMOV_V8I1: |
25491 | case X86::CMOV_V16I1: |
25492 | case X86::CMOV_V32I1: |
25493 | case X86::CMOV_V64I1: |
25494 | return true; |
25495 | |
25496 | default: |
25497 | return false; |
25498 | } |
25499 | } |
25500 | |
25501 | MachineBasicBlock * |
25502 | X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, |
25503 | MachineBasicBlock *BB) const { |
25504 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25505 | DebugLoc DL = MI.getDebugLoc(); |
25506 | |
25507 | // To "insert" a SELECT_CC instruction, we actually have to insert the |
25508 | // diamond control-flow pattern. The incoming instruction knows the |
25509 | // destination vreg to set, the condition code register to branch on, the |
25510 | // true/false values to select between, and a branch opcode to use. |
25511 | const BasicBlock *LLVM_BB = BB->getBasicBlock(); |
25512 | MachineFunction::iterator It = ++BB->getIterator(); |
25513 | |
25514 | // thisMBB: |
25515 | // ... |
25516 | // TrueVal = ... |
25517 | // cmpTY ccX, r1, r2 |
25518 | // bCC copy1MBB |
25519 | // fallthrough --> copy0MBB |
25520 | MachineBasicBlock *thisMBB = BB; |
25521 | MachineFunction *F = BB->getParent(); |
25522 | |
25523 | // This code lowers all pseudo-CMOV instructions. Generally it lowers these |
25524 | // as described above, by inserting a BB, and then making a PHI at the join |
25525 | // point to select the true and false operands of the CMOV in the PHI. |
25526 | // |
25527 | // The code also handles two different cases of multiple CMOV opcodes |
25528 | // in a row. |
25529 | // |
25530 | // Case 1: |
25531 | // In this case, there are multiple CMOVs in a row, all which are based on |
25532 | // the same condition setting (or the exact opposite condition setting). |
25533 | // In this case we can lower all the CMOVs using a single inserted BB, and |
25534 | // then make a number of PHIs at the join point to model the CMOVs. The only |
25535 | // trickiness here, is that in a case like: |
25536 | // |
25537 | // t2 = CMOV cond1 t1, f1 |
25538 | // t3 = CMOV cond1 t2, f2 |
25539 | // |
25540 | // when rewriting this into PHIs, we have to perform some renaming on the |
25541 | // temps since you cannot have a PHI operand refer to a PHI result earlier |
25542 | // in the same block. The "simple" but wrong lowering would be: |
25543 | // |
25544 | // t2 = PHI t1(BB1), f1(BB2) |
25545 | // t3 = PHI t2(BB1), f2(BB2) |
25546 | // |
25547 | // but clearly t2 is not defined in BB1, so that is incorrect. The proper |
25548 | // renaming is to note that on the path through BB1, t2 is really just a |
25549 | // copy of t1, and do that renaming, properly generating: |
25550 | // |
25551 | // t2 = PHI t1(BB1), f1(BB2) |
25552 | // t3 = PHI t1(BB1), f2(BB2) |
25553 | // |
25554 | // In case 2, we lower cascaded CMOVs such as
25555 | // |
25556 | // (CMOV (CMOV F, T, cc1), T, cc2) |
25557 | // |
25558 | // to two successive branches. For that, we look for another CMOV as the |
25559 | // following instruction. |
25560 | // |
25561 | // Without this, we would add a PHI between the two jumps, which ends up |
25562 | // creating a few copies all around. For instance, for |
25563 | // |
25564 | // (sitofp (zext (fcmp une))) |
25565 | // |
25566 | // we would generate: |
25567 | // |
25568 | // ucomiss %xmm1, %xmm0 |
25569 | // movss <1.0f>, %xmm0 |
25570 | // movaps %xmm0, %xmm1 |
25571 | // jne .LBB5_2 |
25572 | // xorps %xmm1, %xmm1 |
25573 | // .LBB5_2: |
25574 | // jp .LBB5_4 |
25575 | // movaps %xmm1, %xmm0 |
25576 | // .LBB5_4: |
25577 | // retq |
25578 | // |
25579 | // because this custom-inserter would have generated: |
25580 | // |
25581 | // A |
25582 | // | \ |
25583 | // | B |
25584 | // | / |
25585 | // C |
25586 | // | \ |
25587 | // | D |
25588 | // | / |
25589 | // E |
25590 | // |
25591 | // A: X = ...; Y = ... |
25592 | // B: empty |
25593 | // C: Z = PHI [X, A], [Y, B] |
25594 | // D: empty |
25595 | // E: PHI [X, C], [Z, D] |
25596 | // |
25597 | // If we lower both CMOVs in a single step, we can instead generate: |
25598 | // |
25599 | // A |
25600 | // | \ |
25601 | // | C |
25602 | // | /| |
25603 | // |/ | |
25604 | // | | |
25605 | // | D |
25606 | // | / |
25607 | // E |
25608 | // |
25609 | // A: X = ...; Y = ... |
25610 | // D: empty |
25611 | // E: PHI [X, A], [X, C], [Y, D] |
25612 | // |
25613 | // Which, in our sitofp/fcmp example, gives us something like: |
25614 | // |
25615 | // ucomiss %xmm1, %xmm0 |
25616 | // movss <1.0f>, %xmm0 |
25617 | // jne .LBB5_4 |
25618 | // jp .LBB5_4 |
25619 | // xorps %xmm0, %xmm0 |
25620 | // .LBB5_4: |
25621 | // retq |
25622 | // |
25623 | MachineInstr *CascadedCMOV = nullptr; |
25624 | MachineInstr *LastCMOV = &MI; |
25625 | X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); |
25626 | X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); |
25627 | MachineBasicBlock::iterator NextMIIt = |
25628 | std::next(MachineBasicBlock::iterator(MI)); |
25629 | |
25630 | // Check for case 1, where there are multiple CMOVs with the same condition |
25631 | // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the |
25632 | // number of jumps the most. |
25633 | |
25634 | if (isCMOVPseudo(MI)) { |
25635 | // See if we have a string of CMOVS with the same condition. |
25636 | while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) && |
25637 | (NextMIIt->getOperand(3).getImm() == CC || |
25638 | NextMIIt->getOperand(3).getImm() == OppCC)) { |
25639 | LastCMOV = &*NextMIIt; |
25640 | ++NextMIIt; |
25641 | } |
25642 | } |
25643 | |
25644 | // Check for case 2, but only if we didn't already find case 1, as
25645 | // indicated by LastCMOV still pointing at MI.
25646 | if (LastCMOV == &MI && NextMIIt != BB->end() && |
25647 | NextMIIt->getOpcode() == MI.getOpcode() && |
25648 | NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && |
25649 | NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && |
25650 | NextMIIt->getOperand(1).isKill()) { |
25651 | CascadedCMOV = &*NextMIIt; |
25652 | } |
25653 | |
25654 | MachineBasicBlock *jcc1MBB = nullptr; |
25655 | |
25656 | // If we have a cascaded CMOV, we lower it to two successive branches to |
25657 | // the same block. EFLAGS is used by both, so mark it as live in the second. |
25658 | if (CascadedCMOV) { |
25659 | jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); |
25660 | F->insert(It, jcc1MBB); |
25661 | jcc1MBB->addLiveIn(X86::EFLAGS); |
25662 | } |
25663 | |
25664 | MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); |
25665 | MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); |
25666 | F->insert(It, copy0MBB); |
25667 | F->insert(It, sinkMBB); |
25668 | |
25669 | // If the EFLAGS register isn't dead in the terminator, then claim that it's |
25670 | // live into the sink and copy blocks. |
25671 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
25672 | |
25673 | MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; |
25674 | if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && |
25675 | !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { |
25676 | copy0MBB->addLiveIn(X86::EFLAGS); |
25677 | sinkMBB->addLiveIn(X86::EFLAGS); |
25678 | } |
25679 | |
25680 | // Transfer the remainder of BB and its successor edges to sinkMBB. |
25681 | sinkMBB->splice(sinkMBB->begin(), BB, |
25682 | std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); |
25683 | sinkMBB->transferSuccessorsAndUpdatePHIs(BB); |
25684 | |
25685 | // Add the true and fallthrough blocks as its successors. |
25686 | if (CascadedCMOV) { |
25687 | // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. |
25688 | BB->addSuccessor(jcc1MBB); |
25689 | |
25690 | // In that case, jcc1MBB will itself fall through to copy0MBB, or
25691 | // jump to the sinkMBB.
25692 | jcc1MBB->addSuccessor(copy0MBB); |
25693 | jcc1MBB->addSuccessor(sinkMBB); |
25694 | } else { |
25695 | BB->addSuccessor(copy0MBB); |
25696 | } |
25697 | |
25698 | // The true block target of the first (or only) branch is always sinkMBB. |
25699 | BB->addSuccessor(sinkMBB); |
25700 | |
25701 | // Create the conditional branch instruction. |
25702 | unsigned Opc = X86::GetCondBranchFromCond(CC); |
25703 | BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); |
25704 | |
25705 | if (CascadedCMOV) { |
25706 | unsigned Opc2 = X86::GetCondBranchFromCond( |
25707 | (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); |
25708 | BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); |
25709 | } |
25710 | |
25711 | // copy0MBB: |
25712 | // %FalseValue = ... |
25713 | // # fallthrough to sinkMBB |
25714 | copy0MBB->addSuccessor(sinkMBB); |
25715 | |
25716 | // sinkMBB: |
25717 | // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] |
25718 | // ... |
25719 | MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); |
25720 | MachineBasicBlock::iterator MIItEnd = |
25721 | std::next(MachineBasicBlock::iterator(LastCMOV)); |
25722 | MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); |
25723 | DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; |
25724 | MachineInstrBuilder MIB; |
25725 | |
25726 | // As we are creating the PHIs, we have to be careful if there is more than |
25727 | // one. Later CMOVs may reference the results of earlier CMOVs, but later |
25728 | // PHIs have to reference the individual true/false inputs from earlier PHIs. |
25729 | // That also means that PHI construction must work forward from earlier to |
25730 | // later, and that the code must maintain a mapping from each earlier PHI's
25731 | // destination register to the registers that went into that PHI.
25732 | |
25733 | for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { |
25734 | unsigned DestReg = MIIt->getOperand(0).getReg(); |
25735 | unsigned Op1Reg = MIIt->getOperand(1).getReg(); |
25736 | unsigned Op2Reg = MIIt->getOperand(2).getReg(); |
25737 | |
25738 | // If this CMOV we are generating is the opposite condition from |
25739 | // the jump we generated, then we have to swap the operands for the |
25740 | // PHI that is going to be generated. |
25741 | if (MIIt->getOperand(3).getImm() == OppCC) |
25742 | std::swap(Op1Reg, Op2Reg); |
25743 | |
25744 | if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) |
25745 | Op1Reg = RegRewriteTable[Op1Reg].first; |
25746 | |
25747 | if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) |
25748 | Op2Reg = RegRewriteTable[Op2Reg].second; |
25749 | |
25750 | MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, |
25751 | TII->get(X86::PHI), DestReg) |
25752 | .addReg(Op1Reg).addMBB(copy0MBB) |
25753 | .addReg(Op2Reg).addMBB(thisMBB); |
25754 | |
25755 | // Add this PHI to the rewrite table. |
25756 | RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); |
25757 | } |
25758 | |
25759 | // If we have a cascaded CMOV, the second Jcc provides the same incoming |
25760 | // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). |
25761 | if (CascadedCMOV) { |
25762 | MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB); |
25763 | // Copy the PHI result to the register defined by the second CMOV. |
25764 | BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), |
25765 | DL, TII->get(TargetOpcode::COPY), |
25766 | CascadedCMOV->getOperand(0).getReg()) |
25767 | .addReg(MI.getOperand(0).getReg()); |
25768 | CascadedCMOV->eraseFromParent(); |
25769 | } |
25770 | |
25771 | // Now remove the CMOV(s). |
25772 | for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) |
25773 | (MIIt++)->eraseFromParent(); |
25774 | |
25775 | return sinkMBB; |
25776 | } |
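// A minimal sketch of the RegRewriteTable logic above, outside the MI
// machinery (std::map stands in for DenseMap, register numbers are plain
// unsigned values, and the helper name 'rewritePHIInputs' is hypothetical):
//
//   #include <map>
//   #include <utility>
//
//   using RewriteTable = std::map<unsigned, std::pair<unsigned, unsigned>>;
//
//   // Returns the (true, false) inputs a later PHI must use, given that
//   // its operands may be the results of earlier PHIs.
//   std::pair<unsigned, unsigned>
//   rewritePHIInputs(RewriteTable &Table, unsigned Dest,
//                    unsigned Op1, unsigned Op2) {
//     auto It = Table.find(Op1);
//     if (It != Table.end())
//       Op1 = It->second.first;    // use the earlier PHI's true input
//     It = Table.find(Op2);
//     if (It != Table.end())
//       Op2 = It->second.second;   // use the earlier PHI's false input
//     Table[Dest] = {Op1, Op2};    // record for still-later PHIs
//     return {Op1, Op2};
//   }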
25777 | |
25778 | MachineBasicBlock * |
25779 | X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, |
25780 | MachineBasicBlock *BB) const { |
25781 | // Combine the following atomic floating-point modification pattern: |
25782 | // a.store(reg OP a.load(acquire), release) |
25783 | // Transform it into:
25784 | // OPss (%gpr), %xmm |
25785 | // movss %xmm, (%gpr) |
25786 | // or the 'sd' equivalent for 64-bit operations.
25787 | unsigned MOp, FOp; |
25788 | switch (MI.getOpcode()) { |
25789 | default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP")::llvm::llvm_unreachable_internal("unexpected instr type for EmitLoweredAtomicFP" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 25789); |
25790 | case X86::RELEASE_FADD32mr: |
25791 | FOp = X86::ADDSSrm; |
25792 | MOp = X86::MOVSSmr; |
25793 | break; |
25794 | case X86::RELEASE_FADD64mr: |
25795 | FOp = X86::ADDSDrm; |
25796 | MOp = X86::MOVSDmr; |
25797 | break; |
25798 | } |
25799 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
25800 | DebugLoc DL = MI.getDebugLoc(); |
25801 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
25802 | unsigned ValOpIdx = X86::AddrNumOperands; |
25803 | unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); |
25804 | MachineInstrBuilder MIB = |
25805 | BuildMI(*BB, MI, DL, TII->get(FOp), |
25806 | MRI.createVirtualRegister(MRI.getRegClass(VSrc))) |
25807 | .addReg(VSrc); |
25808 | for (int i = 0; i < X86::AddrNumOperands; ++i) { |
25809 | MachineOperand &Operand = MI.getOperand(i); |
25810 | // Clear any kill flags on register operands as we'll create a second |
25811 | // instruction using the same address operands. |
25812 | if (Operand.isReg()) |
25813 | Operand.setIsKill(false); |
25814 | MIB.add(Operand); |
25815 | } |
25816 | MachineInstr *FOpMI = MIB; |
25817 | MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); |
25818 | for (int i = 0; i < X86::AddrNumOperands; ++i) |
25819 | MIB.add(MI.getOperand(i)); |
25820 | MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); |
25821 | MI.eraseFromParent(); // The pseudo instruction is gone now. |
25822 | return BB; |
25823 | } |
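// A C++ source pattern of the shape this routine handles, as a sketch
// (the function name 'fetchAddRelease' is hypothetical, and whether the
// RELEASE_FADD pseudo is actually formed depends on the target features):
//
//   #include <atomic>
//
//   void fetchAddRelease(std::atomic<float> &A, float V) {
//     A.store(V + A.load(std::memory_order_acquire),
//             std::memory_order_release);
//   }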
25824 | |
25825 | MachineBasicBlock * |
25826 | X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, |
25827 | MachineBasicBlock *BB) const { |
25828 | MachineFunction *MF = BB->getParent(); |
25829 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
25830 | DebugLoc DL = MI.getDebugLoc(); |
25831 | const BasicBlock *LLVM_BB = BB->getBasicBlock(); |
25832 | |
25833 | assert(MF->shouldSplitStack());
25834 | |
25835 | const bool Is64Bit = Subtarget.is64Bit(); |
25836 | const bool IsLP64 = Subtarget.isTarget64BitLP64(); |
25837 | |
25838 | const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; |
25839 | const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; |
25840 | |
25841 | // BB: |
25842 | // ... [Till the alloca] |
25843 | // If stacklet is not large enough, jump to mallocMBB |
25844 | // |
25845 | // bumpMBB: |
25846 | // Allocate by subtracting from RSP |
25847 | // Jump to continueMBB |
25848 | // |
25849 | // mallocMBB: |
25850 | // Allocate by call to runtime |
25851 | // |
25852 | // continueMBB: |
25853 | // ... |
25854 | // [rest of original BB] |
25855 | // |
25856 | |
25857 | MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25858 | MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25859 | MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); |
25860 | |
25861 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
25862 | const TargetRegisterClass *AddrRegClass = |
25863 | getRegClassFor(getPointerTy(MF->getDataLayout())); |
25864 | |
25865 | unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), |
25866 | bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), |
25867 | tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), |
25868 | SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), |
25869 | sizeVReg = MI.getOperand(1).getReg(), |
25870 | physSPReg = |
25871 | IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; |
25872 | |
25873 | MachineFunction::iterator MBBIter = ++BB->getIterator(); |
25874 | |
25875 | MF->insert(MBBIter, bumpMBB); |
25876 | MF->insert(MBBIter, mallocMBB); |
25877 | MF->insert(MBBIter, continueMBB); |
25878 | |
25879 | continueMBB->splice(continueMBB->begin(), BB, |
25880 | std::next(MachineBasicBlock::iterator(MI)), BB->end()); |
25881 | continueMBB->transferSuccessorsAndUpdatePHIs(BB); |
25882 | |
25883 | // Add code to the main basic block to check if the stack limit has been hit, |
25884 | // and if so, jump to mallocMBB, otherwise to bumpMBB.
25885 | BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); |
25886 | BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) |
25887 | .addReg(tmpSPVReg).addReg(sizeVReg); |
25888 | BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) |
25889 | .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) |
25890 | .addReg(SPLimitVReg); |
25891 | BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); |
25892 | |
25893 | // bumpMBB simply decreases the stack pointer, since we know the current |
25894 | // stacklet has enough space. |
25895 | BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) |
25896 | .addReg(SPLimitVReg); |
25897 | BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) |
25898 | .addReg(SPLimitVReg); |
25899 | BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); |
25900 | |
25901 | // Calls into a routine in libgcc to allocate more space from the heap. |
25902 | const uint32_t *RegMask = |
25903 | Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); |
25904 | if (IsLP64) { |
25905 | BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) |
25906 | .addReg(sizeVReg); |
25907 | BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) |
25908 | .addExternalSymbol("__morestack_allocate_stack_space") |
25909 | .addRegMask(RegMask) |
25910 | .addReg(X86::RDI, RegState::Implicit) |
25911 | .addReg(X86::RAX, RegState::ImplicitDefine); |
25912 | } else if (Is64Bit) { |
25913 | BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) |
25914 | .addReg(sizeVReg); |
25915 | BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) |
25916 | .addExternalSymbol("__morestack_allocate_stack_space") |
25917 | .addRegMask(RegMask) |
25918 | .addReg(X86::EDI, RegState::Implicit) |
25919 | .addReg(X86::EAX, RegState::ImplicitDefine); |
25920 | } else { |
25921 | BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) |
25922 | .addImm(12); |
25923 | BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); |
25924 | BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) |
25925 | .addExternalSymbol("__morestack_allocate_stack_space") |
25926 | .addRegMask(RegMask) |
25927 | .addReg(X86::EAX, RegState::ImplicitDefine); |
25928 | } |
25929 | |
25930 | if (!Is64Bit) |
25931 | BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) |
25932 | .addImm(16); |
25933 | |
25934 | BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) |
25935 | .addReg(IsLP64 ? X86::RAX : X86::EAX); |
25936 | BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); |
25937 | |
25938 | // Set up the CFG correctly. |
25939 | BB->addSuccessor(bumpMBB); |
25940 | BB->addSuccessor(mallocMBB); |
25941 | mallocMBB->addSuccessor(continueMBB); |
25942 | bumpMBB->addSuccessor(continueMBB); |
25943 | |
25944 | // Take care of the PHI nodes. |
25945 | BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), |
25946 | MI.getOperand(0).getReg()) |
25947 | .addReg(mallocPtrVReg) |
25948 | .addMBB(mallocMBB) |
25949 | .addReg(bumpSPPtrVReg) |
25950 | .addMBB(bumpMBB); |
25951 | |
25952 | // Delete the original pseudo instruction. |
25953 | MI.eraseFromParent(); |
25954 | |
25955 | // And we're done. |
25956 | return continueMBB; |
25957 | } |
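// In pseudo-C, the split-stack check emitted above is roughly the following
// (a sketch; 'tls_stack_limit' stands for the %fs/%gs slot at TlsOffset and
// is not a real symbol):
//
//   char *SPAfter = StackPointer - Size;                // SUB{32,64}rr
//   if (tls_stack_limit > SPAfter)                      // CMPmr + JG
//     Result = __morestack_allocate_stack_space(Size);  // mallocMBB
//   else
//     Result = StackPointer = SPAfter;                  // bumpMBB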
25958 | |
25959 | MachineBasicBlock * |
25960 | X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, |
25961 | MachineBasicBlock *BB) const { |
25962 | MachineFunction *MF = BB->getParent(); |
25963 | const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); |
25964 | MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); |
25965 | DebugLoc DL = MI.getDebugLoc(); |
25966 | |
25967 | assert(!isAsynchronousEHPersonality(
25968 |            classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25969 |        "SEH does not use catchret!");
25970 | |
25971 | // Only 32-bit EH needs to worry about manually restoring stack pointers. |
25972 | if (!Subtarget.is32Bit()) |
25973 | return BB; |
25974 | |
25975 | // C++ EH creates a new target block to hold the restore code, and wires up |
25976 | // the new block to the return destination with a normal JMP_4. |
25977 | MachineBasicBlock *RestoreMBB = |
25978 | MF->CreateMachineBasicBlock(BB->getBasicBlock()); |
25979 | assert(BB->succ_size() == 1);
25980 | MF->insert(std::next(BB->getIterator()), RestoreMBB); |
25981 | RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); |
25982 | BB->addSuccessor(RestoreMBB); |
25983 | MI.getOperand(0).setMBB(RestoreMBB); |
25984 | |
25985 | auto RestoreMBBI = RestoreMBB->begin(); |
25986 | BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); |
25987 | BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); |
25988 | return BB; |
25989 | } |
25990 | |
25991 | MachineBasicBlock * |
25992 | X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, |
25993 | MachineBasicBlock *BB) const { |
25994 | MachineFunction *MF = BB->getParent(); |
25995 | const Constant *PerFn = MF->getFunction()->getPersonalityFn(); |
25996 | bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); |
25997 | // Only 32-bit SEH requires special handling for catchpad. |
25998 | if (IsSEH && Subtarget.is32Bit()) { |
25999 | const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); |
26000 | DebugLoc DL = MI.getDebugLoc(); |
26001 | BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); |
26002 | } |
26003 | MI.eraseFromParent(); |
26004 | return BB; |
26005 | } |
26006 | |
26007 | MachineBasicBlock * |
26008 | X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, |
26009 | MachineBasicBlock *BB) const { |
26010 | // So, here we replace TLSADDR with the sequence: |
26011 | // adjust_stackdown -> TLSADDR -> adjust_stackup. |
26012 | // We need this because TLSADDR is lowered into a call
26013 | // inside MC; without the two markers, shrink-wrapping
26014 | // may push the prologue/epilogue past them.
26015 | const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); |
26016 | DebugLoc DL = MI.getDebugLoc(); |
26017 | MachineFunction &MF = *BB->getParent(); |
26018 | |
26019 | // Emit CALLSEQ_START right before the instruction. |
26020 | unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); |
26021 | MachineInstrBuilder CallseqStart = |
26022 | BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); |
26023 | BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); |
26024 | |
26025 | // Emit CALLSEQ_END right after the instruction. |
26026 | // We don't call eraseFromParent because we want to keep the
26027 | // original instruction around. |
26028 | unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); |
26029 | MachineInstrBuilder CallseqEnd = |
26030 | BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); |
26031 | BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); |
26032 | |
26033 | return BB; |
26034 | } |
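// A minimal source pattern that reaches this inserter, as a sketch: under
// the general-dynamic TLS model (e.g. when compiling position-independent
// code) the access below becomes a __tls_get_addr call, which is why the
// CALLSEQ markers are required around TLSADDR:
//
//   thread_local int Counter;
//   int next() { return ++Counter; }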
26035 | |
26036 | MachineBasicBlock * |
26037 | X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, |
26038 | MachineBasicBlock *BB) const { |
26039 | // This is pretty easy. We're taking the value that we received from |
26040 | // our load from the relocation, sticking it in either RDI (x86-64) |
26041 | // or EAX and doing an indirect call. The return value will then |
26042 | // be in the normal return register. |
26043 | MachineFunction *F = BB->getParent(); |
26044 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
26045 | DebugLoc DL = MI.getDebugLoc(); |
26046 | |
26047 | assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26048 | assert(MI.getOperand(3).isGlobal() && "This should be a global");
26049 | |
26050 | // Get a register mask for the lowered call. |
26051 | // FIXME: The 32-bit calls have non-standard calling conventions. Use a |
26052 | // proper register mask. |
26053 | const uint32_t *RegMask = |
26054 | Subtarget.is64Bit() ? |
26055 | Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : |
26056 | Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); |
26057 | if (Subtarget.is64Bit()) { |
26058 | MachineInstrBuilder MIB = |
26059 | BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) |
26060 | .addReg(X86::RIP) |
26061 | .addImm(0) |
26062 | .addReg(0) |
26063 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, |
26064 | MI.getOperand(3).getTargetFlags()) |
26065 | .addReg(0); |
26066 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); |
26067 | addDirectMem(MIB, X86::RDI); |
26068 | MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); |
26069 | } else if (!isPositionIndependent()) { |
26070 | MachineInstrBuilder MIB = |
26071 | BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) |
26072 | .addReg(0) |
26073 | .addImm(0) |
26074 | .addReg(0) |
26075 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, |
26076 | MI.getOperand(3).getTargetFlags()) |
26077 | .addReg(0); |
26078 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); |
26079 | addDirectMem(MIB, X86::EAX); |
26080 | MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); |
26081 | } else { |
26082 | MachineInstrBuilder MIB = |
26083 | BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) |
26084 | .addReg(TII->getGlobalBaseReg(F)) |
26085 | .addImm(0) |
26086 | .addReg(0) |
26087 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, |
26088 | MI.getOperand(3).getTargetFlags()) |
26089 | .addReg(0); |
26090 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); |
26091 | addDirectMem(MIB, X86::EAX); |
26092 | MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); |
26093 | } |
26094 | |
26095 | MI.eraseFromParent(); // The pseudo instruction is gone now. |
26096 | return BB; |
26097 | } |
26098 | |
26099 | MachineBasicBlock * |
26100 | X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, |
26101 | MachineBasicBlock *MBB) const { |
26102 | DebugLoc DL = MI.getDebugLoc(); |
26103 | MachineFunction *MF = MBB->getParent(); |
26104 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
26105 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
26106 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
26107 | |
26108 | const BasicBlock *BB = MBB->getBasicBlock(); |
26109 | MachineFunction::iterator I = ++MBB->getIterator(); |
26110 | |
26111 | // Memory Reference |
26112 | MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); |
26113 | MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); |
26114 | |
26115 | unsigned DstReg; |
26116 | unsigned MemOpndSlot = 0; |
26117 | |
26118 | unsigned CurOp = 0; |
26119 | |
26120 | DstReg = MI.getOperand(CurOp++).getReg(); |
26121 | const TargetRegisterClass *RC = MRI.getRegClass(DstReg); |
26122 | assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26123 | (void)TRI; |
26124 | unsigned mainDstReg = MRI.createVirtualRegister(RC); |
26125 | unsigned restoreDstReg = MRI.createVirtualRegister(RC); |
26126 | |
26127 | MemOpndSlot = CurOp; |
26128 | |
26129 | MVT PVT = getPointerTy(MF->getDataLayout()); |
26130 | assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26131 |        "Invalid Pointer Size!");
26132 | |
26133 | // For v = setjmp(buf), we generate |
26134 | // |
26135 | // thisMBB: |
26136 | // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB |
26137 | // SjLjSetup restoreMBB |
26138 | // |
26139 | // mainMBB: |
26140 | // v_main = 0 |
26141 | // |
26142 | // sinkMBB: |
26143 | // v = phi(main, restore) |
26144 | // |
26145 | // restoreMBB: |
26146 | // if the base pointer is being used, load it from the frame
26147 | // v_restore = 1 |
26148 | |
26149 | MachineBasicBlock *thisMBB = MBB; |
26150 | MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); |
26151 | MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); |
26152 | MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); |
26153 | MF->insert(I, mainMBB); |
26154 | MF->insert(I, sinkMBB); |
26155 | MF->push_back(restoreMBB); |
26156 | restoreMBB->setHasAddressTaken(); |
26157 | |
26158 | MachineInstrBuilder MIB; |
26159 | |
26160 | // Transfer the remainder of BB and its successor edges to sinkMBB. |
26161 | sinkMBB->splice(sinkMBB->begin(), MBB, |
26162 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); |
26163 | sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); |
26164 | |
26165 | // thisMBB: |
26166 | unsigned PtrStoreOpc = 0; |
26167 | unsigned LabelReg = 0; |
26168 | const int64_t LabelOffset = 1 * PVT.getStoreSize(); |
26169 | bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && |
26170 | !isPositionIndependent(); |
26171 | |
26172 | // Prepare IP either in reg or imm. |
26173 | if (!UseImmLabel) { |
26174 | PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; |
26175 | const TargetRegisterClass *PtrRC = getRegClassFor(PVT); |
26176 | LabelReg = MRI.createVirtualRegister(PtrRC); |
26177 | if (Subtarget.is64Bit()) { |
26178 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) |
26179 | .addReg(X86::RIP) |
26180 | .addImm(0) |
26181 | .addReg(0) |
26182 | .addMBB(restoreMBB) |
26183 | .addReg(0); |
26184 | } else { |
26185 | const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); |
26186 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) |
26187 | .addReg(XII->getGlobalBaseReg(MF)) |
26188 | .addImm(0) |
26189 | .addReg(0) |
26190 | .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) |
26191 | .addReg(0); |
26192 | } |
26193 | } else |
26194 | PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; |
26195 | // Store IP |
26196 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); |
26197 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { |
26198 | if (i == X86::AddrDisp) |
26199 | MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); |
26200 | else |
26201 | MIB.add(MI.getOperand(MemOpndSlot + i)); |
26202 | } |
26203 | if (!UseImmLabel) |
26204 | MIB.addReg(LabelReg); |
26205 | else |
26206 | MIB.addMBB(restoreMBB); |
26207 | MIB.setMemRefs(MMOBegin, MMOEnd); |
26208 | // Setup |
26209 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) |
26210 | .addMBB(restoreMBB); |
26211 | |
26212 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
26213 | MIB.addRegMask(RegInfo->getNoPreservedMask()); |
26214 | thisMBB->addSuccessor(mainMBB); |
26215 | thisMBB->addSuccessor(restoreMBB); |
26216 | |
26217 | // mainMBB: |
26218 | // EAX = 0 |
26219 | BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); |
26220 | mainMBB->addSuccessor(sinkMBB); |
26221 | |
26222 | // sinkMBB: |
26223 | BuildMI(*sinkMBB, sinkMBB->begin(), DL, |
26224 | TII->get(X86::PHI), DstReg) |
26225 | .addReg(mainDstReg).addMBB(mainMBB) |
26226 | .addReg(restoreDstReg).addMBB(restoreMBB); |
26227 | |
26228 | // restoreMBB: |
26229 | if (RegInfo->hasBasePointer(*MF)) { |
26230 | const bool Uses64BitFramePtr = |
26231 | Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); |
26232 | X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); |
26233 | X86FI->setRestoreBasePointer(MF); |
26234 | unsigned FramePtr = RegInfo->getFrameRegister(*MF); |
26235 | unsigned BasePtr = RegInfo->getBaseRegister(); |
26236 | unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; |
26237 | addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), |
26238 | FramePtr, true, X86FI->getRestoreBasePointerOffset()) |
26239 | .setMIFlag(MachineInstr::FrameSetup); |
26240 | } |
26241 | BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); |
26242 | BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); |
26243 | restoreMBB->addSuccessor(sinkMBB); |
26244 | |
26245 | MI.eraseFromParent(); |
26246 | return sinkMBB; |
26247 | } |
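// The EH_SjLj_SetJmp pseudos back the GCC/Clang builtin setjmp; a usage
// sketch (the buffer must be five pointer-sized words):
//
//   static void *Buf[5];
//
//   int tryOperation() {
//     if (__builtin_setjmp(Buf) == 0)
//       return 0;   // v_main = 0: normal path, may later __builtin_longjmp
//     return 1;     // v_restore = 1: resumed via the longjmp
//   }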
26248 | |
26249 | MachineBasicBlock * |
26250 | X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, |
26251 | MachineBasicBlock *MBB) const { |
26252 | DebugLoc DL = MI.getDebugLoc(); |
26253 | MachineFunction *MF = MBB->getParent(); |
26254 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
26255 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
26256 | |
26257 | // Memory Reference |
26258 | MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); |
26259 | MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); |
26260 | |
26261 | MVT PVT = getPointerTy(MF->getDataLayout()); |
26262 | assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26263 |        "Invalid Pointer Size!");
26264 | |
26265 | const TargetRegisterClass *RC = |
26266 | (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; |
26267 | unsigned Tmp = MRI.createVirtualRegister(RC); |
26268 | // Since FP is only updated here but NOT referenced, it's treated as GPR. |
26269 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
26270 | unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; |
26271 | unsigned SP = RegInfo->getStackRegister(); |
26272 | |
26273 | MachineInstrBuilder MIB; |
26274 | |
26275 | const int64_t LabelOffset = 1 * PVT.getStoreSize(); |
26276 | const int64_t SPOffset = 2 * PVT.getStoreSize(); |
26277 | |
26278 | unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; |
26279 | unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; |
26280 | |
26281 | // Reload FP |
26282 | MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); |
26283 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) |
26284 | MIB.add(MI.getOperand(i)); |
26285 | MIB.setMemRefs(MMOBegin, MMOEnd); |
26286 | // Reload IP |
26287 | MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); |
26288 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { |
26289 | if (i == X86::AddrDisp) |
26290 | MIB.addDisp(MI.getOperand(i), LabelOffset); |
26291 | else |
26292 | MIB.add(MI.getOperand(i)); |
26293 | } |
26294 | MIB.setMemRefs(MMOBegin, MMOEnd); |
26295 | // Reload SP |
26296 | MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); |
26297 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { |
26298 | if (i == X86::AddrDisp) |
26299 | MIB.addDisp(MI.getOperand(i), SPOffset); |
26300 | else |
26301 | MIB.add(MI.getOperand(i)); |
26302 | } |
26303 | MIB.setMemRefs(MMOBegin, MMOEnd); |
26304 | // Jump |
26305 | BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); |
26306 | |
26307 | MI.eraseFromParent(); |
26308 | return MBB; |
26309 | } |
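// Buffer slots assumed by the setjmp/longjmp emitters above, with PVT the
// pointer type (a summary of the LabelOffset/SPOffset arithmetic, not new
// behavior):
//
//   Buf[0]                  frame pointer (reloaded into RBP/EBP)
//   Buf[1]  (LabelOffset)   resume address stored by setjmp, jumped to by
//                           the indirect JMP emitted here
//   Buf[2]  (SPOffset)      stack pointer (reloaded into RSP/ESP)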
26310 | |
26311 | void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, |
26312 | MachineBasicBlock *MBB, |
26313 | MachineBasicBlock *DispatchBB, |
26314 | int FI) const { |
26315 | DebugLoc DL = MI.getDebugLoc(); |
26316 | MachineFunction *MF = MBB->getParent(); |
26317 | MachineRegisterInfo *MRI = &MF->getRegInfo(); |
26318 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
26319 | |
26320 | MVT PVT = getPointerTy(MF->getDataLayout()); |
26321 | assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26322 | |
26323 | unsigned Op = 0; |
26324 | unsigned VR = 0; |
26325 | |
26326 | bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && |
26327 | !isPositionIndependent(); |
26328 | |
26329 | if (UseImmLabel) { |
26330 | Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; |
26331 | } else { |
26332 | const TargetRegisterClass *TRC = |
26333 | (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; |
26334 | VR = MRI->createVirtualRegister(TRC); |
26335 | Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; |
26336 | |
26337 | if (Subtarget.is64Bit()) |
26338 | BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) |
26339 | .addReg(X86::RIP) |
26340 | .addImm(1) |
26341 | .addReg(0) |
26342 | .addMBB(DispatchBB) |
26343 | .addReg(0); |
26344 | else |
26345 | BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) |
26346 | .addReg(0) /* TII->getGlobalBaseReg(MF) */ |
26347 | .addImm(1) |
26348 | .addReg(0) |
26349 | .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) |
26350 | .addReg(0); |
26351 | } |
26352 | |
26353 | MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); |
26354 | addFrameReference(MIB, FI, 36); |
26355 | if (UseImmLabel) |
26356 | MIB.addMBB(DispatchBB); |
26357 | else |
26358 | MIB.addReg(VR); |
26359 | } |
26360 | |
26361 | MachineBasicBlock * |
26362 | X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, |
26363 | MachineBasicBlock *BB) const { |
26364 | DebugLoc DL = MI.getDebugLoc(); |
26365 | MachineFunction *MF = BB->getParent(); |
26366 | MachineFrameInfo &MFI = MF->getFrameInfo(); |
26367 | MachineRegisterInfo *MRI = &MF->getRegInfo(); |
26368 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
26369 | int FI = MFI.getFunctionContextIndex(); |
26370 | |
26371 | // Get a mapping of the call site numbers to all of the landing pads they're |
26372 | // associated with. |
26373 | DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad; |
26374 | unsigned MaxCSNum = 0; |
26375 | for (auto &MBB : *MF) { |
26376 | if (!MBB.isEHPad()) |
26377 | continue; |
26378 | |
26379 | MCSymbol *Sym = nullptr; |
26380 | for (const auto &MI : MBB) { |
26381 | if (MI.isDebugValue()) |
26382 | continue; |
26383 | |
26384 | assert(MI.isEHLabel() && "expected EH_LABEL");
26385 | Sym = MI.getOperand(0).getMCSymbol(); |
26386 | break; |
26387 | } |
26388 | |
26389 | if (!MF->hasCallSiteLandingPad(Sym)) |
26390 | continue; |
26391 | |
26392 | for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { |
26393 | CallSiteNumToLPad[CSI].push_back(&MBB); |
26394 | MaxCSNum = std::max(MaxCSNum, CSI); |
26395 | } |
26396 | } |
26397 | |
26398 | // Get an ordered list of the machine basic blocks for the jump table. |
26399 | std::vector<MachineBasicBlock *> LPadList; |
26400 | SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs; |
26401 | LPadList.reserve(CallSiteNumToLPad.size()); |
26402 | |
26403 | for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { |
26404 | for (auto &LP : CallSiteNumToLPad[CSI]) { |
26405 | LPadList.push_back(LP); |
26406 | InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); |
26407 | } |
26408 | } |
26409 | |
26410 | assert(!LPadList.empty() &&
26411 |        "No landing pad destinations for the dispatch jump table!");
26412 | |
26413 | // Create the MBBs for the dispatch code. |
26414 | |
26415 | // Shove the dispatch's address into the return slot in the function context. |
26416 | MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); |
26417 | DispatchBB->setIsEHPad(true); |
26418 | |
26419 | MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); |
26420 | BuildMI(TrapBB, DL, TII->get(X86::TRAP)); |
26421 | DispatchBB->addSuccessor(TrapBB); |
26422 | |
26423 | MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); |
26424 | DispatchBB->addSuccessor(DispContBB); |
26425 | |
26426 | // Insert MBBs. |
26427 | MF->push_back(DispatchBB); |
26428 | MF->push_back(DispContBB); |
26429 | MF->push_back(TrapBB); |
26430 | |
26431 | // Insert code into the entry block that creates and registers the function |
26432 | // context. |
26433 | SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); |
26434 | |
26435 | // Create the jump table and associated information |
26436 | MachineJumpTableInfo *JTI = |
26437 | MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); |
26438 | unsigned MJTI = JTI->createJumpTableIndex(LPadList); |
26439 | |
26440 | const X86RegisterInfo &RI = TII->getRegisterInfo(); |
26441 | // Add a register mask with no preserved registers. This results in all |
26442 | // registers being marked as clobbered. |
26443 | if (RI.hasBasePointer(*MF)) { |
26444 | const bool FPIs64Bit = |
26445 | Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); |
26446 | X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); |
26447 | MFI->setRestoreBasePointer(MF); |
26448 | |
26449 | unsigned FP = RI.getFrameRegister(*MF); |
26450 | unsigned BP = RI.getBaseRegister(); |
26451 | unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; |
26452 | addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, |
26453 | MFI->getRestoreBasePointerOffset()) |
26454 | .addRegMask(RI.getNoPreservedMask()); |
26455 | } else { |
26456 | BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) |
26457 | .addRegMask(RI.getNoPreservedMask()); |
26458 | } |
26459 | |
26460 | unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass); |
26461 | addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, |
26462 | 4); |
26463 | BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) |
26464 | .addReg(IReg) |
26465 | .addImm(LPadList.size()); |
26466 | BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB); |
26467 | |
26468 | unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass); |
26469 | BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg) |
26470 | .addReg(IReg) |
26471 | .addImm(1); |
26472 | BuildMI(DispContBB, DL, |
26473 | TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m)) |
26474 | .addReg(0) |
26475 | .addImm(Subtarget.is64Bit() ? 8 : 4) |
26476 | .addReg(JReg) |
26477 | .addJumpTableIndex(MJTI) |
26478 | .addReg(0); |
26479 | |
26480 | // Add the jump table entries as successors to the MBB. |
26481 | SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; |
26482 | for (auto &LP : LPadList) |
26483 | if (SeenMBBs.insert(LP).second) |
26484 | DispContBB->addSuccessor(LP); |
26485 | |
26486 | // N.B. the order the invoke BBs are processed in doesn't matter here. |
26487 | SmallVector<MachineBasicBlock *, 64> MBBLPads; |
26488 | const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); |
26489 | for (MachineBasicBlock *MBB : InvokeBBs) { |
26490 | // Remove the landing pad successor from the invoke block and replace it |
26491 | // with the new dispatch block. |
26492 | // Keep a copy of Successors since it's modified inside the loop. |
26493 | SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(), |
26494 | MBB->succ_rend()); |
26495 | // FIXME: Avoid quadratic complexity. |
26496 | for (auto MBBS : Successors) { |
26497 | if (MBBS->isEHPad()) { |
26498 | MBB->removeSuccessor(MBBS); |
26499 | MBBLPads.push_back(MBBS); |
26500 | } |
26501 | } |
26502 | |
26503 | MBB->addSuccessor(DispatchBB); |
26504 | |
26505 | // Find the invoke call and mark all of the callee-saved registers as |
26506 | // 'implicit defined' so that they're spilled. This prevents code from |
26507 | // moving instructions to before the EH block, where they will never be |
26508 | // executed. |
26509 | for (auto &II : reverse(*MBB)) { |
26510 | if (!II.isCall()) |
26511 | continue; |
26512 | |
26513 | DenseMap<unsigned, bool> DefRegs; |
26514 | for (auto &MOp : II.operands()) |
26515 | if (MOp.isReg()) |
26516 | DefRegs[MOp.getReg()] = true; |
26517 | |
26518 | MachineInstrBuilder MIB(*MF, &II); |
26519 | for (unsigned RI = 0; SavedRegs[RI]; ++RI) { |
26520 | unsigned Reg = SavedRegs[RI]; |
26521 | if (!DefRegs[Reg]) |
26522 | MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); |
26523 | } |
26524 | |
26525 | break; |
26526 | } |
26527 | } |
26528 | |
26529 | // Mark all former landing pads as non-landing pads. The dispatch is the only |
26530 | // landing pad now. |
26531 | for (auto &LP : MBBLPads) |
26532 | LP->setIsEHPad(false); |
26533 | |
26534 | // The instruction is gone now. |
26535 | MI.eraseFromParent(); |
26536 | return BB; |
26537 | } |
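// The dispatch blocks built above behave like the following pseudo-C (a
// sketch; 'CallSite' stands for the 32-bit slot read from the function
// context at offset 4):
//
//   unsigned CS = CallSite;             // MOV32rm from FI + 4
//   if (CS > NumLPads)                  // CMP32ri + JA
//     __builtin_trap();                 // TrapBB
//   goto *JumpTable[CS - 1];            // SUB32ri + indexed JMP{32,64}m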
26538 | |
26539 | MachineBasicBlock * |
26540 | X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, |
26541 | MachineBasicBlock *BB) const { |
26542 | MachineFunction *MF = BB->getParent(); |
26543 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
26544 | DebugLoc DL = MI.getDebugLoc(); |
26545 | |
26546 | switch (MI.getOpcode()) { |
26547 | default: llvm_unreachable("Unexpected instr type to insert")::llvm::llvm_unreachable_internal("Unexpected instr type to insert" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 26547); |
26548 | case X86::TAILJMPd64: |
26549 | case X86::TAILJMPr64: |
26550 | case X86::TAILJMPm64: |
26551 | case X86::TAILJMPr64_REX: |
26552 | case X86::TAILJMPm64_REX: |
26553 | llvm_unreachable("TAILJMP64 would not be touched here.")::llvm::llvm_unreachable_internal("TAILJMP64 would not be touched here." , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 26553); |
26554 | case X86::TCRETURNdi64: |
26555 | case X86::TCRETURNri64: |
26556 | case X86::TCRETURNmi64: |
26557 | return BB; |
26558 | case X86::TLS_addr32: |
26559 | case X86::TLS_addr64: |
26560 | case X86::TLS_base_addr32: |
26561 | case X86::TLS_base_addr64: |
26562 | return EmitLoweredTLSAddr(MI, BB); |
26563 | case X86::CATCHRET: |
26564 | return EmitLoweredCatchRet(MI, BB); |
26565 | case X86::CATCHPAD: |
26566 | return EmitLoweredCatchPad(MI, BB); |
26567 | case X86::SEG_ALLOCA_32: |
26568 | case X86::SEG_ALLOCA_64: |
26569 | return EmitLoweredSegAlloca(MI, BB); |
26570 | case X86::TLSCall_32: |
26571 | case X86::TLSCall_64: |
26572 | return EmitLoweredTLSCall(MI, BB); |
26573 | case X86::CMOV_FR32: |
26574 | case X86::CMOV_FR64: |
26575 | case X86::CMOV_FR128: |
26576 | case X86::CMOV_GR8: |
26577 | case X86::CMOV_GR16: |
26578 | case X86::CMOV_GR32: |
26579 | case X86::CMOV_RFP32: |
26580 | case X86::CMOV_RFP64: |
26581 | case X86::CMOV_RFP80: |
26582 | case X86::CMOV_V2F64: |
26583 | case X86::CMOV_V2I64: |
26584 | case X86::CMOV_V4F32: |
26585 | case X86::CMOV_V4F64: |
26586 | case X86::CMOV_V4I64: |
26587 | case X86::CMOV_V16F32: |
26588 | case X86::CMOV_V8F32: |
26589 | case X86::CMOV_V8F64: |
26590 | case X86::CMOV_V8I64: |
26591 | case X86::CMOV_V8I1: |
26592 | case X86::CMOV_V16I1: |
26593 | case X86::CMOV_V32I1: |
26594 | case X86::CMOV_V64I1: |
26595 | return EmitLoweredSelect(MI, BB); |
26596 | |
26597 | case X86::RDFLAGS32: |
26598 | case X86::RDFLAGS64: { |
26599 | unsigned PushF = |
26600 | MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; |
26601 | unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; |
26602 | MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); |
26603 | // Permit reads of the FLAGS register without it being defined. |
26604 | // This intrinsic exists to read external processor state in flags, such as |
26605 | // the trap flag, interrupt flag, and direction flag, none of which are |
26606 | // modeled by the backend. |
26607 | Push->getOperand(2).setIsUndef(); |
26608 | BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); |
26609 | |
26610 | MI.eraseFromParent(); // The pseudo is gone now. |
26611 | return BB; |
26612 | } |
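// From C, these pseudos are reachable through the read-EFLAGS intrinsic;
// a sketch (clang and GCC expose __readeflags via <x86intrin.h>):
//
//   #include <x86intrin.h>
//
//   unsigned long long currentFlags() {
//     return __readeflags();   // lowers to RDFLAGS64 on x86-64
//   }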
26613 | |
26614 | case X86::WRFLAGS32: |
26615 | case X86::WRFLAGS64: { |
26616 | unsigned Push = |
26617 | MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; |
26618 | unsigned PopF = |
26619 | MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; |
26620 | BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); |
26621 | BuildMI(*BB, MI, DL, TII->get(PopF)); |
26622 | |
26623 | MI.eraseFromParent(); // The pseudo is gone now. |
26624 | return BB; |
26625 | } |
26626 | |
26627 | case X86::RELEASE_FADD32mr: |
26628 | case X86::RELEASE_FADD64mr: |
26629 | return EmitLoweredAtomicFP(MI, BB); |
26630 | |
26631 | case X86::FP32_TO_INT16_IN_MEM: |
26632 | case X86::FP32_TO_INT32_IN_MEM: |
26633 | case X86::FP32_TO_INT64_IN_MEM: |
26634 | case X86::FP64_TO_INT16_IN_MEM: |
26635 | case X86::FP64_TO_INT32_IN_MEM: |
26636 | case X86::FP64_TO_INT64_IN_MEM: |
26637 | case X86::FP80_TO_INT16_IN_MEM: |
26638 | case X86::FP80_TO_INT32_IN_MEM: |
26639 | case X86::FP80_TO_INT64_IN_MEM: { |
26640 | // Change the floating point control register to use "round towards zero" |
26641 | // mode when truncating to an integer value. |
26642 | int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); |
26643 | addFrameReference(BuildMI(*BB, MI, DL, |
26644 | TII->get(X86::FNSTCW16m)), CWFrameIdx); |
26645 | |
26646 | // Load the old value of the control word...
26647 | unsigned OldCW = |
26648 | MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); |
26649 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), |
26650 | CWFrameIdx); |
26651 | |
26652 | // Set the rounding mode to round toward zero...
26653 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) |
26654 | .addImm(0xC7F); |
26655 | |
26656 | // Reload the modified control word now... |
26657 | addFrameReference(BuildMI(*BB, MI, DL, |
26658 | TII->get(X86::FLDCW16m)), CWFrameIdx); |
26659 | |
26660 | // Restore the memory image of the control word to its original value
26661 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) |
26662 | .addReg(OldCW); |
26663 | |
26664 | // Get the X86 opcode to use. |
26665 | unsigned Opc; |
26666 | switch (MI.getOpcode()) { |
26667 | default: llvm_unreachable("illegal opcode!")::llvm::llvm_unreachable_internal("illegal opcode!", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 26667); |
26668 | case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; |
26669 | case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; |
26670 | case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; |
26671 | case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; |
26672 | case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; |
26673 | case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; |
26674 | case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; |
26675 | case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; |
26676 | case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; |
26677 | } |
26678 | |
26679 | X86AddressMode AM = getAddressFromInstr(&MI, 0); |
26680 | addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) |
26681 | .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); |
26682 | |
26683 | // Reload the original control word now. |
26684 | addFrameReference(BuildMI(*BB, MI, DL, |
26685 | TII->get(X86::FLDCW16m)), CWFrameIdx); |
26686 | |
26687 | MI.eraseFromParent(); // The pseudo instruction is gone now. |
26688 | return BB; |
26689 | } |
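// Why the control-word dance is needed, as a sketch: C casts truncate
// toward zero, while the x87 default rounding mode is round-to-nearest.
// The sequence above temporarily installs 0xC7F, whose rounding-control
// bits select round-toward-zero, so that the plain C semantics hold:
//
//   long long truncateToInt(double D) {
//     return (long long)D;   // (long long)2.9 == 2, (long long)-2.9 == -2
//   }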
26690 | // String/text processing lowering. |
26691 | case X86::PCMPISTRM128REG: |
26692 | case X86::VPCMPISTRM128REG: |
26693 | case X86::PCMPISTRM128MEM: |
26694 | case X86::VPCMPISTRM128MEM: |
26695 | case X86::PCMPESTRM128REG: |
26696 | case X86::VPCMPESTRM128REG: |
26697 | case X86::PCMPESTRM128MEM: |
26698 | case X86::VPCMPESTRM128MEM: |
26699 | assert(Subtarget.hasSSE42() &&
26700 |        "Target must have SSE4.2 or AVX features enabled");
26701 | return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); |
26702 | |
26703 | // String/text processing lowering. |
26704 | case X86::PCMPISTRIREG: |
26705 | case X86::VPCMPISTRIREG: |
26706 | case X86::PCMPISTRIMEM: |
26707 | case X86::VPCMPISTRIMEM: |
26708 | case X86::PCMPESTRIREG: |
26709 | case X86::VPCMPESTRIREG: |
26710 | case X86::PCMPESTRIMEM: |
26711 | case X86::VPCMPESTRIMEM: |
26712 | assert(Subtarget.hasSSE42() &&
26713 |        "Target must have SSE4.2 or AVX features enabled");
26714 | return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); |
26715 | |
26716 | // Thread synchronization. |
26717 | case X86::MONITOR: |
26718 | return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); |
26719 | case X86::MONITORX: |
26720 | return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); |
26721 | |
26722 | // Cache line zero |
26723 | case X86::CLZERO: |
26724 | return emitClzero(&MI, BB, Subtarget); |
26725 | |
26726 | // PKU feature |
26727 | case X86::WRPKRU: |
26728 | return emitWRPKRU(MI, BB, Subtarget); |
26729 | case X86::RDPKRU: |
26730 | return emitRDPKRU(MI, BB, Subtarget); |
26731 | // xbegin |
26732 | case X86::XBEGIN: |
26733 | return emitXBegin(MI, BB, Subtarget.getInstrInfo()); |
26734 | |
26735 | case X86::VASTART_SAVE_XMM_REGS: |
26736 | return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); |
26737 | |
26738 | case X86::VAARG_64: |
26739 | return EmitVAARG64WithCustomInserter(MI, BB); |
26740 | |
26741 | case X86::EH_SjLj_SetJmp32: |
26742 | case X86::EH_SjLj_SetJmp64: |
26743 | return emitEHSjLjSetJmp(MI, BB); |
26744 | |
26745 | case X86::EH_SjLj_LongJmp32: |
26746 | case X86::EH_SjLj_LongJmp64: |
26747 | return emitEHSjLjLongJmp(MI, BB); |
26748 | |
26749 | case X86::Int_eh_sjlj_setup_dispatch: |
26750 | return EmitSjLjDispatchBlock(MI, BB); |
26751 | |
26752 | case TargetOpcode::STATEPOINT: |
26753 | // As an implementation detail, STATEPOINT shares the STACKMAP format at |
26754 | // this point in the process. We diverge later. |
26755 | return emitPatchPoint(MI, BB); |
26756 | |
26757 | case TargetOpcode::STACKMAP: |
26758 | case TargetOpcode::PATCHPOINT: |
26759 | return emitPatchPoint(MI, BB); |
26760 | |
26761 | case TargetOpcode::PATCHABLE_EVENT_CALL: |
26762 | // Do nothing here, handle in xray instrumentation pass. |
26763 | return BB; |
26764 | |
26765 | case X86::LCMPXCHG8B: { |
26766 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
26767 | // In addition to the four E[ABCD] registers implied by the encoding, CMPXCHG8B
26768 | // requires a memory operand. If the current architecture happens to be i686
26769 | // and the current function needs a base pointer - which is ESI for i686 -
26770 | // the register allocator would not be able to allocate registers for an
26771 | // address of the form X(%reg, %reg, Y): there would never be enough
26772 | // unreserved registers during regalloc (without the need for a base pointer,
26773 | // the only option would be X(%edi, %esi, Y)).
26774 | // We give the register allocator a hand by precomputing the address in
26775 | // a new vreg using LEA.
26776 | |
26777 | // If it is not i686 or there is no base pointer, there is nothing to do here.
26778 | if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) |
26779 | return BB; |
26780 | |
26781 | // Even though this code does not necessarily need the base pointer to
26782 | // be ESI, we check for that. The reason: if this assert fails, some
26783 | // changes have happened in the compiler's base pointer handling, which
26784 | // most probably have to be addressed here as well.
26785 | assert(TRI->getBaseRegister() == X86::ESI &&
26786 |        "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26787 |        "base pointer in mind");
26788 | |
26789 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
26790 | MVT SPTy = getPointerTy(MF->getDataLayout()); |
26791 | const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); |
26792 | unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); |
26793 | |
26794 | X86AddressMode AM = getAddressFromInstr(&MI, 0); |
26795 | // Regalloc does not need any help when the memory operand of CMPXCHG8B |
26796 | // does not use an index register.
26797 | if (AM.IndexReg == X86::NoRegister) |
26798 | return BB; |
26799 | |
26800 | // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its |
26801 | // four operand definitions that are E[ABCD] registers. We skip them and |
26802 | // then insert the LEA. |
26803 | MachineBasicBlock::iterator MBBI(MI); |
26804 | while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || |
26805 | MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) |
26806 | --MBBI; |
26807 | addFullAddress( |
26808 | BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); |
26809 | |
26810 | setDirectAddressInInstr(&MI, 0, computedAddrVReg); |
26811 | |
26812 | return BB; |
26813 | } |
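// The address precomputation above in miniature, as a pseudo-C sketch:
// a disp(%base, %index, scale) operand needs two allocatable GPRs, which
// i686 CMPXCHG8B (EAX/EBX/ECX/EDX implied) plus a reserved ESI base pointer
// cannot supply; the LEA collapses the address into a single register:
//
//   uint64_t *Addr = Base + Index * Scale + Disp;   // LEA32r into a new vreg
//   lockCmpxchg8b(Addr);                            // memory operand: (Addr)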
26814 | case X86::LCMPXCHG16B: |
26815 | return BB; |
26816 | case X86::LCMPXCHG8B_SAVE_EBX: |
26817 | case X86::LCMPXCHG16B_SAVE_RBX: { |
26818 | unsigned BasePtr = |
26819 | MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; |
26820 | if (!BB->isLiveIn(BasePtr)) |
26821 | BB->addLiveIn(BasePtr); |
26822 | return BB; |
26823 | } |
26824 | } |
26825 | } |
26826 | |
26827 | //===----------------------------------------------------------------------===// |
26828 | // X86 Optimization Hooks |
26829 | //===----------------------------------------------------------------------===// |
26830 | |
26831 | void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, |
26832 | KnownBits &Known, |
26833 | const APInt &DemandedElts, |
26834 | const SelectionDAG &DAG, |
26835 | unsigned Depth) const { |
26836 | unsigned BitWidth = Known.getBitWidth(); |
26837 | unsigned Opc = Op.getOpcode(); |
26838 | EVT VT = Op.getValueType(); |
26839 | assert((Opc >= ISD::BUILTIN_OP_END ||
26840 |         Opc == ISD::INTRINSIC_WO_CHAIN ||
26841 |         Opc == ISD::INTRINSIC_W_CHAIN ||
26842 |         Opc == ISD::INTRINSIC_VOID) &&
26843 |        "Should use MaskedValueIsZero if you don't know whether Op"
26844 |        " is a target node!");
26845 | |
26846 | Known.resetAll(); |
26847 | switch (Opc) { |
26848 | default: break; |
26849 | case X86ISD::ADD: |
26850 | case X86ISD::SUB: |
26851 | case X86ISD::ADC: |
26852 | case X86ISD::SBB: |
26853 | case X86ISD::SMUL: |
26854 | case X86ISD::UMUL: |
26855 | case X86ISD::INC: |
26856 | case X86ISD::DEC: |
26857 | case X86ISD::OR: |
26858 | case X86ISD::XOR: |
26859 | case X86ISD::AND: |
26860 | // These nodes' second result is a boolean. |
26861 | if (Op.getResNo() == 0) |
26862 | break; |
26863 | LLVM_FALLTHROUGH;
26864 | case X86ISD::SETCC: |
26865 | Known.Zero.setBitsFrom(1); |
26866 | break; |
26867 | case X86ISD::MOVMSK: { |
26868 | unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); |
26869 | Known.Zero.setBitsFrom(NumLoBits); |
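 | // e.g. (illustrative) MOVMSK of a v4f32 packs the four sign bits into
 | // the low 4 result bits, so bits [4, BitWidth) are known zero.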
26870 | break; |
26871 | } |
26872 | case X86ISD::VSHLI: |
26873 | case X86ISD::VSRLI: { |
26874 | if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { |
26875 | if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { |
26876 | Known.setAllZero(); |
26877 | break; |
26878 | } |
26879 | |
26880 | DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); |
26881 | unsigned ShAmt = ShiftImm->getZExtValue(); |
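 | // e.g. (illustrative) a v8i16 VSHLI by 4 shifts the source's known
 | // bits left by 4 and additionally makes the low 4 bits known zero;
 | // VSRLI by 4 mirrors this with the high 4 bits known zero.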
26882 | if (Opc == X86ISD::VSHLI) { |
26883 | Known.Zero <<= ShAmt; |
26884 | Known.One <<= ShAmt; |
26885 | // Low bits are known zero. |
26886 | Known.Zero.setLowBits(ShAmt); |
26887 | } else { |
26888 | Known.Zero.lshrInPlace(ShAmt); |
26889 | Known.One.lshrInPlace(ShAmt); |
26890 | // High bits are known zero. |
26891 | Known.Zero.setHighBits(ShAmt); |
26892 | } |
26893 | } |
26894 | break; |
26895 | } |
26896 | case X86ISD::VZEXT: { |
26897 | SDValue N0 = Op.getOperand(0); |
26898 | unsigned NumElts = VT.getVectorNumElements(); |
26899 | |
26900 | EVT SrcVT = N0.getValueType(); |
26901 | unsigned InNumElts = SrcVT.getVectorNumElements(); |
26902 | unsigned InBitWidth = SrcVT.getScalarSizeInBits(); |
26903 | assert(InNumElts >= NumElts && "Illegal VZEXT input");
26904 | |
26905 | Known = KnownBits(InBitWidth); |
26906 | APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); |
26907 | DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); |
26908 | Known = Known.zext(BitWidth); |
26909 | Known.Zero.setBitsFrom(InBitWidth); |
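 | // e.g. (illustrative) a VZEXT from v16i8 to v8i16 leaves bits [8, 16)
 | // of every result element known zero.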
26910 | break; |
26911 | } |
26912 | } |
26913 | } |
26914 | |
26915 | unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( |
26916 | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, |
26917 | unsigned Depth) const { |
26918 | unsigned VTBits = Op.getScalarValueSizeInBits(); |
26919 | unsigned Opcode = Op.getOpcode(); |
26920 | switch (Opcode) { |
26921 | case X86ISD::SETCC_CARRY: |
26922 | // SETCC_CARRY sets the dest to ~0 for true or 0 for false. |
26923 | return VTBits; |
26924 | |
26925 | case X86ISD::VSEXT: { |
26926 | SDValue Src = Op.getOperand(0); |
26927 | unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); |
26928 | Tmp += VTBits - Src.getScalarValueSizeInBits(); |
26929 | return Tmp; |
26930 | } |
26931 | |
26932 | case X86ISD::VSHLI: { |
26933 | SDValue Src = Op.getOperand(0); |
26934 | unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); |
26935 | APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); |
26936 | if (ShiftVal.uge(VTBits)) |
26937 | return VTBits; // Shifted all bits out --> zero. |
26938 | if (ShiftVal.uge(Tmp)) |
26939 | return 1; // Shifted all sign bits out --> unknown. |
26940 | return Tmp - ShiftVal.getZExtValue(); |
26941 | } |
26942 | |
26943 | case X86ISD::VSRAI: { |
26944 | SDValue Src = Op.getOperand(0); |
26945 | unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); |
26946 | APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); |
26947 | ShiftVal += Tmp; |
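 | // e.g. (illustrative) a v4i32 VSRAI by 31 on a source with one known
 | // sign bit yields 1 + 31 = 32 sign bits (an all-sign-bits value).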
26948 | return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); |
26949 | } |
26950 | |
26951 | case X86ISD::PCMPGT: |
26952 | case X86ISD::PCMPEQ: |
26953 | case X86ISD::CMPP: |
26954 | case X86ISD::VPCOM: |
26955 | case X86ISD::VPCOMU: |
26956 | // Vector compares return zero/all-bits result values. |
26957 | return VTBits; |
26958 | } |
26959 | |
26960 | // Fallback case. |
26961 | return 1; |
26962 | } |
26963 | |
26964 | /// Returns true (and the GlobalValue and the offset) if the node is a |
26965 | /// GlobalAddress + offset. |
26966 | bool X86TargetLowering::isGAPlusOffset(SDNode *N, |
26967 | const GlobalValue* &GA, |
26968 | int64_t &Offset) const { |
26969 | if (N->getOpcode() == X86ISD::Wrapper) { |
26970 | if (isa<GlobalAddressSDNode>(N->getOperand(0))) { |
26971 | GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); |
26972 | Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); |
26973 | return true; |
26974 | } |
26975 | } |
26976 | return TargetLowering::isGAPlusOffset(N, GA, Offset); |
26977 | } |
26978 | |
26979 | // Attempt to match a combined shuffle mask against supported unary shuffle |
26980 | // instructions. |
26981 | // TODO: Investigate sharing more of this with shuffle lowering. |
26982 | static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, |
26983 | bool AllowFloatDomain, bool AllowIntDomain, |
26984 | SDValue &V1, SDLoc &DL, SelectionDAG &DAG, |
26985 | const X86Subtarget &Subtarget, |
26986 | unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { |
26987 | unsigned NumMaskElts = Mask.size(); |
26988 | unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); |
26989 | |
26990 | // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. |
26991 | // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). |
26992 | if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || |
26993 | (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { |
26994 | unsigned MaxScale = 64 / MaskEltSize; |
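 | // e.g. (illustrative) a v4i32 mask {0, Zero, 1, Zero} matches at
 | // Scale == 2 and becomes a v4i32 -> v2i64 ZERO_EXTEND_VECTOR_INREG.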
26995 | for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { |
26996 | bool Match = true; |
26997 | unsigned NumDstElts = NumMaskElts / Scale; |
26998 | for (unsigned i = 0; i != NumDstElts && Match; ++i) { |
26999 | Match &= isUndefOrEqual(Mask[i * Scale], (int)i); |
27000 | Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); |
27001 | } |
27002 | if (Match) { |
27003 | unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); |
27004 | SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); |
27005 | if (SrcVT != MaskVT) |
27006 | V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); |
27007 | DstVT = MVT::getIntegerVT(Scale * MaskEltSize); |
27008 | DstVT = MVT::getVectorVT(DstVT, NumDstElts); |
27009 | Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) |
27010 | : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); |
27011 | return true; |
27012 | } |
27013 | } |
27014 | } |
27015 | |
27016 | // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
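 | // e.g. (illustrative) a v4f32 mask {0, Zero, Zero, Zero} keeps the low
 | // element and zeroes the rest - exactly VZEXT_MOVL (MOVSS against zero).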
27017 | if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && |
27018 | isUndefOrEqual(Mask[0], 0) && |
27019 | isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { |
27020 | Shuffle = X86ISD::VZEXT_MOVL; |
27021 | SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; |
27022 | return true; |
27023 | } |
27024 | |
27025 | // Check if we have SSE3, which will let us use MOVDDUP etc. These
27026 | // instructions are no slower than UNPCKLPD but have the option to
27027 | // fold the input operand into even an unaligned memory load.
27028 | if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { |
27029 | if (isTargetShuffleEquivalent(Mask, {0, 0})) { |
27030 | Shuffle = X86ISD::MOVDDUP; |
27031 | SrcVT = DstVT = MVT::v2f64; |
27032 | return true; |
27033 | } |
27034 | if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { |
27035 | Shuffle = X86ISD::MOVSLDUP; |
27036 | SrcVT = DstVT = MVT::v4f32; |
27037 | return true; |
27038 | } |
27039 | if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { |
27040 | Shuffle = X86ISD::MOVSHDUP; |
27041 | SrcVT = DstVT = MVT::v4f32; |
27042 | return true; |
27043 | } |
27044 | } |
27045 | |
27046 | if (MaskVT.is256BitVector() && AllowFloatDomain) { |
27047 | assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27048 | if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { |
27049 | Shuffle = X86ISD::MOVDDUP; |
27050 | SrcVT = DstVT = MVT::v4f64; |
27051 | return true; |
27052 | } |
27053 | if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { |
27054 | Shuffle = X86ISD::MOVSLDUP; |
27055 | SrcVT = DstVT = MVT::v8f32; |
27056 | return true; |
27057 | } |
27058 | if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { |
27059 | Shuffle = X86ISD::MOVSHDUP; |
27060 | SrcVT = DstVT = MVT::v8f32; |
27061 | return true; |
27062 | } |
27063 | } |
27064 | |
27065 | if (MaskVT.is512BitVector() && AllowFloatDomain) { |
27066 | assert(Subtarget.hasAVX512() &&
27067 |        "AVX512 required for 512-bit vector shuffles");
27068 | if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { |
27069 | Shuffle = X86ISD::MOVDDUP; |
27070 | SrcVT = DstVT = MVT::v8f64; |
27071 | return true; |
27072 | } |
27073 | if (isTargetShuffleEquivalent( |
27074 | Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { |
27075 | Shuffle = X86ISD::MOVSLDUP; |
27076 | SrcVT = DstVT = MVT::v16f32; |
27077 | return true; |
27078 | } |
27079 | if (isTargetShuffleEquivalent( |
27080 | Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { |
27081 | Shuffle = X86ISD::MOVSHDUP; |
27082 | SrcVT = DstVT = MVT::v16f32; |
27083 | return true; |
27084 | } |
27085 | } |
27086 | |
27087 | // Attempt to match against broadcast-from-vector. |
27088 | if (Subtarget.hasAVX2()) { |
27089 | SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); |
27090 | if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { |
27091 | SrcVT = DstVT = MaskVT; |
27092 | Shuffle = X86ISD::VBROADCAST; |
27093 | return true; |
27094 | } |
27095 | } |
27096 | |
27097 | return false; |
27098 | } |
27099 | |
27100 | // Attempt to match a combined shuffle mask against supported unary immediate |
27101 | // permute instructions. |
27102 | // TODO: Investigate sharing more of this with shuffle lowering. |
27103 | static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, |
27104 | bool AllowFloatDomain, |
27105 | bool AllowIntDomain, |
27106 | const X86Subtarget &Subtarget, |
27107 | unsigned &Shuffle, MVT &ShuffleVT, |
27108 | unsigned &PermuteImm) { |
27109 | unsigned NumMaskElts = Mask.size(); |
27110 | |
27111 | bool ContainsZeros = false; |
27112 | APInt Zeroable(NumMaskElts, false); |
27113 | for (unsigned i = 0; i != NumMaskElts; ++i) { |
27114 | int M = Mask[i]; |
27115 | if (isUndefOrZero(M)) |
27116 | Zeroable.setBit(i); |
27117 | ContainsZeros |= (M == SM_SentinelZero); |
27118 | } |
27119 | |
27120 | // Attempt to match against byte/bit shifts. |
27121 | // FIXME: Add 512-bit support. |
27122 | if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || |
27123 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { |
27124 | int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, |
27125 | MaskVT.getScalarSizeInBits(), Mask, |
27126 | 0, Zeroable, Subtarget); |
27127 | if (0 < ShiftAmt) { |
27128 | PermuteImm = (unsigned)ShiftAmt; |
27129 | return true; |
27130 | } |
27131 | } |
27132 | |
27133 | // Ensure the mask doesn't contain any zero elements.
27134 | if (ContainsZeros) |
27135 | return false; |
27136 | |
27137 | assert(llvm::all_of(Mask, [&](int M) {
27138 |          return SM_SentinelUndef <= M && M < (int)NumMaskElts;
27139 |        }) && "Expected unary shuffle");
27140 | |
27141 | unsigned InputSizeInBits = MaskVT.getSizeInBits(); |
27142 | unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size(); |
27143 | MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); |
27144 | |
27145 | // Handle PSHUFLW/PSHUFHW repeated patterns. |
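 | // e.g. (illustrative) the v8i16 mask {0,1,2,3,5,4,7,6} keeps the low
 | // half in place and swaps pairs in the high half, matching PSHUFHW
 | // with offset mask {1,0,3,2}, i.e. immediate 0xB1.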
27146 | if (MaskScalarSizeInBits == 16) { |
27147 | SmallVector<int, 4> RepeatedMask; |
27148 | if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { |
27149 | ArrayRef<int> LoMask(Mask.data() + 0, 4); |
27150 | ArrayRef<int> HiMask(Mask.data() + 4, 4); |
27151 | |
27152 | // PSHUFLW: permute lower 4 elements only. |
27153 | if (isUndefOrInRange(LoMask, 0, 4) && |
27154 | isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { |
27155 | Shuffle = X86ISD::PSHUFLW; |
27156 | ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); |
27157 | PermuteImm = getV4X86ShuffleImm(LoMask); |
27158 | return true; |
27159 | } |
27160 | |
27161 | // PSHUFHW: permute upper 4 elements only. |
27162 | if (isUndefOrInRange(HiMask, 4, 8) && |
27163 | isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { |
27164 | // Offset the HiMask so that we can create the shuffle immediate. |
27165 | int OffsetHiMask[4]; |
27166 | for (int i = 0; i != 4; ++i) |
27167 | OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); |
27168 | |
27169 | Shuffle = X86ISD::PSHUFHW; |
27170 | ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); |
27171 | PermuteImm = getV4X86ShuffleImm(OffsetHiMask); |
27172 | return true; |
27173 | } |
27174 | |
27175 | return false; |
27176 | } |
27177 | return false; |
27178 | } |
27179 | |
27180 | // We only support permutation of 32/64 bit elements after this. |
27181 | if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) |
27182 | return false; |
27183 | |
27184 | // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
27185 | // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27186 | if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX()) |
27187 | return false; |
27188 | |
27189 | // Pre-AVX2 we must use float shuffles on 256-bit vectors. |
27190 | if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) { |
27191 | AllowFloatDomain = true; |
27192 | AllowIntDomain = false; |
Value stored to 'AllowIntDomain' is never read | |
27193 | } |
27194 | |
27195 | // Check for lane crossing permutes. |
27196 | if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { |
27197 | // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). |
27198 | if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { |
27199 | Shuffle = X86ISD::VPERMI; |
27200 | ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); |
27201 | PermuteImm = getV4X86ShuffleImm(Mask); |
27202 | return true; |
27203 | } |
27204 | if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) { |
27205 | SmallVector<int, 4> RepeatedMask; |
27206 | if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { |
27207 | Shuffle = X86ISD::VPERMI; |
27208 | ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); |
27209 | PermuteImm = getV4X86ShuffleImm(RepeatedMask); |
27210 | return true; |
27211 | } |
27212 | } |
27213 | return false; |
27214 | } |
27215 | |
27216 | // VPERMILPD can permute with a non-repeating shuffle. |
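 | // Each mask element contributes one immediate bit (M & 1); e.g.
 | // (illustrative) the v4f64 mask {1,1,2,3} encodes as
 | // PermuteImm = 0b1011 = 0x0B.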
27217 | if (AllowFloatDomain && MaskScalarSizeInBits == 64) { |
27218 | Shuffle = X86ISD::VPERMILPI; |
27219 | ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); |
27220 | PermuteImm = 0; |
27221 | for (int i = 0, e = Mask.size(); i != e; ++i) { |
27222 | int M = Mask[i]; |
27223 | if (M == SM_SentinelUndef) |
27224 | continue; |
27225 | assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27226 | PermuteImm |= (M & 1) << i; |
27227 | } |
27228 | return true; |
27229 | } |
27230 | |
27231 | // We need a repeating shuffle mask for VPERMILPS/PSHUFD. |
27232 | SmallVector<int, 4> RepeatedMask; |
27233 | if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) |
27234 | return false; |
27235 | |
27236 | // Narrow the repeated mask for 32-bit element permutes. |
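 | // e.g. (illustrative) the v2f64 swap mask {1,0} widens to the 32-bit
 | // mask {2,3,0,1}, giving PSHUFD/VPERMILPS immediate 0x4E.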
27237 | SmallVector<int, 4> WordMask = RepeatedMask; |
27238 | if (MaskScalarSizeInBits == 64) |
27239 | scaleShuffleMask(2, RepeatedMask, WordMask); |
27240 | |
27241 | Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); |
27242 | ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32); |
27243 | ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); |
27244 | PermuteImm = getV4X86ShuffleImm(WordMask); |
27245 | return true; |
27246 | } |
27247 | |
27248 | // Attempt to match a combined unary shuffle mask against supported binary |
27249 | // shuffle instructions. |
27250 | // TODO: Investigate sharing more of this with shuffle lowering. |
27251 | static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, |
27252 | bool AllowFloatDomain, bool AllowIntDomain, |
27253 | SDValue &V1, SDValue &V2, SDLoc &DL, |
27254 | SelectionDAG &DAG, |
27255 | const X86Subtarget &Subtarget, |
27256 | unsigned &Shuffle, MVT &ShuffleVT, |
27257 | bool IsUnary) { |
27258 | unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); |
27259 | |
27260 | if (MaskVT.is128BitVector()) { |
27261 | if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { |
27262 | V2 = V1; |
27263 | Shuffle = X86ISD::MOVLHPS; |
27264 | ShuffleVT = MVT::v4f32; |
27265 | return true; |
27266 | } |
27267 | if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { |
27268 | V2 = V1; |
27269 | Shuffle = X86ISD::MOVHLPS; |
27270 | ShuffleVT = MVT::v4f32; |
27271 | return true; |
27272 | } |
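 | // X86ISD::MOVSD takes the low element from its second operand, so a
 | // v2f64 mask {0,3} = {V1[0], V2[1]} is matched (illustratively) by
 | // swapping the sources before emitting MOVSD.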
27273 | if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && |
27274 | (AllowFloatDomain || !Subtarget.hasSSE41())) { |
27275 | std::swap(V1, V2); |
27276 | Shuffle = X86ISD::MOVSD; |
27277 | ShuffleVT = MaskVT; |
27278 | return true; |
27279 | } |
27280 | if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && |
27281 | (AllowFloatDomain || !Subtarget.hasSSE41())) { |
27282 | Shuffle = X86ISD::MOVSS; |
27283 | ShuffleVT = MaskVT; |
27284 | return true; |
27285 | } |
27286 | } |
27287 | |
27288 | // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. |
27289 | if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || |
27290 | (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || |
27291 | (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || |
27292 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || |
27293 | (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { |
27294 | if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, |
27295 | DAG, Subtarget)) { |
27296 | ShuffleVT = MaskVT; |
27297 | if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2()) |
27298 | ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); |
27299 | return true; |
27300 | } |
27301 | } |
27302 | |
27303 | return false; |
27304 | } |
27305 | |
27306 | static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, |
27307 | bool AllowFloatDomain, |
27308 | bool AllowIntDomain, |
27309 | SDValue &V1, SDValue &V2, SDLoc &DL, |
27310 | SelectionDAG &DAG, |
27311 | const X86Subtarget &Subtarget, |
27312 | unsigned &Shuffle, MVT &ShuffleVT, |
27313 | unsigned &PermuteImm) { |
27314 | unsigned NumMaskElts = Mask.size(); |
27315 | unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); |
27316 | |
27317 | // Attempt to match against PALIGNR byte rotate. |
27318 | if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || |
27319 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { |
27320 | int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); |
27321 | if (0 < ByteRotation) { |
27322 | Shuffle = X86ISD::PALIGNR; |
27323 | ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); |
27324 | PermuteImm = ByteRotation; |
27325 | return true; |
27326 | } |
27327 | } |
27328 | |
27329 | // Attempt to combine to X86ISD::BLENDI. |
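 | // e.g. (illustrative) the v4i32 mask {0,5,2,7} takes elements 1 and 3
 | // from V2, so BlendMask = 0b1010; without AVX2 this is rescaled to the
 | // v8i16 blend mask 0b11001100.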
27330 | if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || |
27331 | (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || |
27332 | (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { |
27333 | uint64_t BlendMask = 0; |
27334 | bool ForceV1Zero = false, ForceV2Zero = false; |
27335 | SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); |
27336 | if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, |
27337 | BlendMask)) { |
27338 | if (MaskVT == MVT::v16i16) { |
27339 | // We can only use v16i16 PBLENDW if the lanes are repeated. |
27340 | SmallVector<int, 8> RepeatedMask; |
27341 | if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, |
27342 | RepeatedMask)) { |
27343 | assert(RepeatedMask.size() == 8 &&
27344 |        "Repeated mask size doesn't match!");
27345 | PermuteImm = 0; |
27346 | for (int i = 0; i < 8; ++i) |
27347 | if (RepeatedMask[i] >= 8) |
27348 | PermuteImm |= 1 << i; |
27349 | V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; |
27350 | V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; |
27351 | Shuffle = X86ISD::BLENDI; |
27352 | ShuffleVT = MaskVT; |
27353 | return true; |
27354 | } |
27355 | } else { |
27356 | // Determine a type compatible with X86ISD::BLENDI. |
27357 | ShuffleVT = MaskVT; |
27358 | if (Subtarget.hasAVX2()) { |
27359 | if (ShuffleVT == MVT::v4i64) |
27360 | ShuffleVT = MVT::v8i32; |
27361 | else if (ShuffleVT == MVT::v2i64) |
27362 | ShuffleVT = MVT::v4i32; |
27363 | } else { |
27364 | if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) |
27365 | ShuffleVT = MVT::v8i16; |
27366 | else if (ShuffleVT == MVT::v4i64) |
27367 | ShuffleVT = MVT::v4f64; |
27368 | else if (ShuffleVT == MVT::v8i32) |
27369 | ShuffleVT = MVT::v8f32; |
27370 | } |
27371 | |
27372 | if (!ShuffleVT.isFloatingPoint()) { |
27373 | int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); |
27374 | BlendMask = |
27375 | scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); |
27376 | ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); |
27377 | ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); |
27378 | } |
27379 | |
27380 | V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; |
27381 | V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; |
27382 | PermuteImm = (unsigned)BlendMask; |
27383 | Shuffle = X86ISD::BLENDI; |
27384 | return true; |
27385 | } |
27386 | } |
27387 | } |
27388 | |
27389 | // Attempt to combine to INSERTPS. |
27390 | if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && |
27391 | MaskVT.is128BitVector()) { |
27392 | APInt Zeroable(4, 0); |
27393 | for (unsigned i = 0; i != NumMaskElts; ++i) |
27394 | if (Mask[i] < 0) |
27395 | Zeroable.setBit(i); |
27396 | |
27397 | if (Zeroable.getBoolValue() && |
27398 | matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { |
27399 | Shuffle = X86ISD::INSERTPS; |
27400 | ShuffleVT = MVT::v4f32; |
27401 | return true; |
27402 | } |
27403 | } |
27404 | |
27405 | // Attempt to combine to SHUFPD. |
27406 | if (AllowFloatDomain && EltSizeInBits == 64 && |
27407 | ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || |
27408 | (MaskVT.is256BitVector() && Subtarget.hasAVX()) || |
27409 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { |
27410 | if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { |
27411 | Shuffle = X86ISD::SHUFP; |
27412 | ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); |
27413 | return true; |
27414 | } |
27415 | } |
27416 | |
27417 | // Attempt to combine to SHUFPS. |
27418 | if (AllowFloatDomain && EltSizeInBits == 32 && |
27419 | ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || |
27420 | (MaskVT.is256BitVector() && Subtarget.hasAVX()) || |
27421 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { |
27422 | SmallVector<int, 4> RepeatedMask; |
27423 | if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { |
27424 | // Match each half of the repeated mask, to determine if it's just
27425 | // referencing one of the vectors, is zeroable, or is entirely undef.
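 | // e.g. (illustrative) the repeated mask {3,1,4,6} takes its low half
 | // from V1 and its high half from V2, matching SHUFPS with ShufMask
 | // {3,1,0,2}, i.e. immediate 0x87.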
27426 | auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { |
27427 | int M0 = RepeatedMask[Offset]; |
27428 | int M1 = RepeatedMask[Offset + 1]; |
27429 | |
27430 | if (isUndefInRange(RepeatedMask, Offset, 2)) { |
27431 | return DAG.getUNDEF(MaskVT); |
27432 | } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { |
27433 | S0 = (SM_SentinelUndef == M0 ? -1 : 0); |
27434 | S1 = (SM_SentinelUndef == M1 ? -1 : 1); |
27435 | return getZeroVector(MaskVT, Subtarget, DAG, DL); |
27436 | } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { |
27437 | S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); |
27438 | S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); |
27439 | return V1; |
27440 | } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { |
27441 | S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); |
27442 | S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); |
27443 | return V2; |
27444 | } |
27445 | |
27446 | return SDValue(); |
27447 | }; |
27448 | |
27449 | int ShufMask[4] = {-1, -1, -1, -1}; |
27450 | SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); |
27451 | SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); |
27452 | |
27453 | if (Lo && Hi) { |
27454 | V1 = Lo; |
27455 | V2 = Hi; |
27456 | Shuffle = X86ISD::SHUFP; |
27457 | ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); |
27458 | PermuteImm = getV4X86ShuffleImm(ShufMask); |
27459 | return true; |
27460 | } |
27461 | } |
27462 | } |
27463 | |
27464 | return false; |
27465 | } |
27466 | |
27467 | /// \brief Combine an arbitrary chain of shuffles into a single instruction if |
27468 | /// possible. |
27469 | /// |
27470 | /// This is the leaf of the recursive combine below. When we have found some |
27471 | /// chain of single-use x86 shuffle instructions and accumulated the combined |
27472 | /// shuffle mask represented by them, this will try to pattern match that mask |
27473 | /// into either a single instruction if there is a special purpose instruction |
27474 | /// for this operation, or into a PSHUFB instruction which is a fully general |
27475 | /// instruction but should only be used to replace chains over a certain depth. |
27476 | static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, |
27477 | ArrayRef<int> BaseMask, int Depth, |
27478 | bool HasVariableMask, SelectionDAG &DAG, |
27479 | TargetLowering::DAGCombinerInfo &DCI, |
27480 | const X86Subtarget &Subtarget) { |
27481 | assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27482 | assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27483 |        "Unexpected number of shuffle inputs!");
27484 | |
27485 | // Find the inputs that enter the chain. Note that multiple uses are OK |
27486 | // here; we're not going to remove the operands we find.
27487 | bool UnaryShuffle = (Inputs.size() == 1); |
27488 | SDValue V1 = peekThroughBitcasts(Inputs[0]); |
27489 | SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) |
27490 | : peekThroughBitcasts(Inputs[1])); |
27491 | |
27492 | MVT VT1 = V1.getSimpleValueType(); |
27493 | MVT VT2 = V2.getSimpleValueType(); |
27494 | MVT RootVT = Root.getSimpleValueType(); |
27495 | assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27496 |        VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27497 |        "Vector size mismatch");
27498 | |
27499 | SDLoc DL(Root); |
27500 | SDValue Res; |
27501 | |
27502 | unsigned NumBaseMaskElts = BaseMask.size(); |
27503 | if (NumBaseMaskElts == 1) { |
27504 | assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27505 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), |
27506 | /*AddTo*/ true); |
27507 | return true; |
27508 | } |
27509 | |
27510 | unsigned RootSizeInBits = RootVT.getSizeInBits(); |
27511 | unsigned NumRootElts = RootVT.getVectorNumElements(); |
27512 | unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; |
27513 | bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || |
27514 | (RootVT.is256BitVector() && !Subtarget.hasAVX2()); |
27515 | |
27516 | // Don't combine if we are an AVX512/EVEX target and the mask element size
27517 | // is different from the root element size - this would prevent writemasks |
27518 | // from being reused. |
27519 | // TODO - this currently prevents all lane shuffles from occurring. |
27520 | // TODO - check for writemasks usage instead of always preventing combining. |
27521 | // TODO - attempt to narrow Mask back to writemask size. |
27522 | bool IsEVEXShuffle = |
27523 | RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); |
27524 | if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) |
27525 | return false; |
27526 | |
27527 | // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. |
27528 | |
27529 | // Handle 128-bit lane shuffles of 256-bit vectors. |
27530 | // TODO - this should support binary shuffles. |
27531 | if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && |
27532 | !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { |
27533 | if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) |
27534 | return false; // Nothing to do! |
27535 | MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); |
27536 | unsigned PermMask = 0; |
27537 | PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); |
27538 | PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); |
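 | // e.g. (illustrative) BaseMask {1,0} swaps the 128-bit halves
 | // (PermMask = 0x01); a negative (undef/zero) mask element sets bit 3
 | // of its nibble, which zeroes that result lane.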
27539 | |
27540 | Res = DAG.getBitcast(ShuffleVT, V1); |
27541 | DCI.AddToWorklist(Res.getNode()); |
27542 | Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, |
27543 | DAG.getUNDEF(ShuffleVT), |
27544 | DAG.getConstant(PermMask, DL, MVT::i8)); |
27545 | DCI.AddToWorklist(Res.getNode()); |
27546 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27547 | /*AddTo*/ true); |
27548 | return true; |
27549 | } |
27550 | |
27551 | // For masks that have been widened to 128-bit elements or more, |
27552 | // narrow back down to 64-bit elements. |
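 | // e.g. (illustrative) a 256-bit root with the 128-bit-element mask
 | // {1,0} is rewritten as the 64-bit-element mask {2,3,0,1}.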
27553 | SmallVector<int, 64> Mask; |
27554 | if (BaseMaskEltSizeInBits > 64) { |
27555 | assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27556 | int MaskScale = BaseMaskEltSizeInBits / 64; |
27557 | scaleShuffleMask(MaskScale, BaseMask, Mask); |
27558 | } else { |
27559 | Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); |
27560 | } |
27561 | |
27562 | unsigned NumMaskElts = Mask.size(); |
27563 | unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; |
27564 | |
27565 | // Determine the effective mask value type. |
27566 | FloatDomain &= (32 <= MaskEltSizeInBits); |
27567 | MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) |
27568 | : MVT::getIntegerVT(MaskEltSizeInBits); |
27569 | MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); |
27570 | |
27571 | // Only allow legal mask types. |
27572 | if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) |
27573 | return false; |
27574 | |
27575 | // Attempt to match the mask against known shuffle patterns. |
27576 | MVT ShuffleSrcVT, ShuffleVT; |
27577 | unsigned Shuffle, PermuteImm; |
27578 | |
27579 | // Which shuffle domains are permitted? |
27580 | // Permit domain crossing at higher combine depths. |
27581 | bool AllowFloatDomain = FloatDomain || (Depth > 3); |
27582 | bool AllowIntDomain = !FloatDomain || (Depth > 3); |
27583 | |
27584 | if (UnaryShuffle) { |
27585 | // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27586 | // directly if we don't shuffle the lower element and we shuffle the upper |
27587 | // (zero) elements within themselves. |
27588 | if (V1.getOpcode() == X86ISD::VZEXT_LOAD && |
27589 | (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { |
27590 | unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; |
27591 | ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); |
27592 | if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && |
27593 | isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { |
27594 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), |
27595 | /*AddTo*/ true); |
27596 | return true; |
27597 | } |
27598 | } |
27599 | |
27600 | if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, |
27601 | V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, |
27602 | ShuffleVT)) { |
27603 | if (Depth == 1 && Root.getOpcode() == Shuffle) |
27604 | return false; // Nothing to do! |
27605 | if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) |
27606 | return false; // AVX512 Writemask clash. |
27607 | Res = DAG.getBitcast(ShuffleSrcVT, V1); |
27608 | DCI.AddToWorklist(Res.getNode()); |
27609 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); |
27610 | DCI.AddToWorklist(Res.getNode()); |
27611 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27612 | /*AddTo*/ true); |
27613 | return true; |
27614 | } |
27615 | |
27616 | if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, |
27617 | AllowIntDomain, Subtarget, Shuffle, |
27618 | ShuffleVT, PermuteImm)) { |
27619 | if (Depth == 1 && Root.getOpcode() == Shuffle) |
27620 | return false; // Nothing to do! |
27621 | if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) |
27622 | return false; // AVX512 Writemask clash. |
27623 | Res = DAG.getBitcast(ShuffleVT, V1); |
27624 | DCI.AddToWorklist(Res.getNode()); |
27625 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, |
27626 | DAG.getConstant(PermuteImm, DL, MVT::i8)); |
27627 | DCI.AddToWorklist(Res.getNode()); |
27628 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27629 | /*AddTo*/ true); |
27630 | return true; |
27631 | } |
27632 | } |
27633 | |
27634 | if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, |
27635 | V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, |
27636 | UnaryShuffle)) { |
27637 | if (Depth == 1 && Root.getOpcode() == Shuffle) |
27638 | return false; // Nothing to do! |
27639 | if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) |
27640 | return false; // AVX512 Writemask clash. |
27641 | V1 = DAG.getBitcast(ShuffleVT, V1); |
27642 | DCI.AddToWorklist(V1.getNode()); |
27643 | V2 = DAG.getBitcast(ShuffleVT, V2); |
27644 | DCI.AddToWorklist(V2.getNode()); |
27645 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2); |
27646 | DCI.AddToWorklist(Res.getNode()); |
27647 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27648 | /*AddTo*/ true); |
27649 | return true; |
27650 | } |
27651 | |
27652 | if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, |
27653 | AllowIntDomain, V1, V2, DL, DAG, |
27654 | Subtarget, Shuffle, ShuffleVT, |
27655 | PermuteImm)) { |
27656 | if (Depth == 1 && Root.getOpcode() == Shuffle) |
27657 | return false; // Nothing to do! |
27658 | if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) |
27659 | return false; // AVX512 Writemask clash. |
27660 | V1 = DAG.getBitcast(ShuffleVT, V1); |
27661 | DCI.AddToWorklist(V1.getNode()); |
27662 | V2 = DAG.getBitcast(ShuffleVT, V2); |
27663 | DCI.AddToWorklist(V2.getNode()); |
27664 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2, |
27665 | DAG.getConstant(PermuteImm, DL, MVT::i8)); |
27666 | DCI.AddToWorklist(Res.getNode()); |
27667 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27668 | /*AddTo*/ true); |
27669 | return true; |
27670 | } |
27671 | |
27672 | // Don't try to re-form single instruction chains under any circumstances now |
27673 | // that we've done encoding canonicalization for them. |
27674 | if (Depth < 2) |
27675 | return false; |
27676 | |
27677 | bool MaskContainsZeros = |
27678 | any_of(Mask, [](int M) { return M == SM_SentinelZero; }); |
27679 | |
27680 | if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { |
27681 | // If we have a single input lane-crossing shuffle then lower to VPERMV. |
27682 | if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros && |
27683 | ((Subtarget.hasAVX2() && |
27684 | (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || |
27685 | (Subtarget.hasAVX512() && |
27686 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || |
27687 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || |
27688 | (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || |
27689 | (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || |
27690 | (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || |
27691 | (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { |
27692 | MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); |
27693 | MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); |
27694 | SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); |
27695 | DCI.AddToWorklist(VPermMask.getNode()); |
27696 | Res = DAG.getBitcast(MaskVT, V1); |
27697 | DCI.AddToWorklist(Res.getNode()); |
27698 | Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); |
27699 | DCI.AddToWorklist(Res.getNode()); |
27700 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27701 | /*AddTo*/ true); |
27702 | return true; |
27703 | } |
27704 | |
27705 | // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero |
27706 | // vector as the second source. |
27707 | if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && |
27708 | ((Subtarget.hasAVX512() && |
27709 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || |
27710 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || |
27711 | (Subtarget.hasVLX() && |
27712 | (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || |
27713 | MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || |
27714 | (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || |
27715 | (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || |
27716 | (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || |
27717 | (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { |
27718 | // Adjust shuffle mask - replace SM_SentinelZero with second source index. |
27719 | for (unsigned i = 0; i != NumMaskElts; ++i) |
27720 | if (Mask[i] == SM_SentinelZero) |
27721 | Mask[i] = NumMaskElts + i; |
27722 | |
27723 | MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); |
27724 | MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); |
27725 | SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); |
27726 | DCI.AddToWorklist(VPermMask.getNode()); |
27727 | Res = DAG.getBitcast(MaskVT, V1); |
27728 | DCI.AddToWorklist(Res.getNode()); |
27729 | SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); |
27730 | DCI.AddToWorklist(Zero.getNode()); |
27731 | Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); |
27732 | DCI.AddToWorklist(Res.getNode()); |
27733 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27734 | /*AddTo*/ true); |
27735 | return true; |
27736 | } |
27737 | |
27738 | // If we have a dual input lane-crossing shuffle then lower to VPERMV3. |
27739 | if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros && |
27740 | ((Subtarget.hasAVX512() && |
27741 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || |
27742 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || |
27743 | (Subtarget.hasVLX() && |
27744 | (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || |
27745 | MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || |
27746 | (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || |
27747 | (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || |
27748 | (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || |
27749 | (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { |
27750 | MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); |
27751 | MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); |
27752 | SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); |
27753 | DCI.AddToWorklist(VPermMask.getNode()); |
27754 | V1 = DAG.getBitcast(MaskVT, V1); |
27755 | DCI.AddToWorklist(V1.getNode()); |
27756 | V2 = DAG.getBitcast(MaskVT, V2); |
27757 | DCI.AddToWorklist(V2.getNode()); |
27758 | Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); |
27759 | DCI.AddToWorklist(Res.getNode()); |
27760 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27761 | /*AddTo*/ true); |
27762 | return true; |
27763 | } |
27764 | return false; |
27765 | } |
27766 | |
27767 | // See if we can combine a single input shuffle with zeros to a bit-mask, |
27768 | // which is much simpler than any shuffle. |
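 | // e.g. (illustrative) the v4i32 mask {0, Zero, 2, Zero} becomes an
 | // AND with the constant vector {-1, 0, -1, 0}.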
27769 | if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) && |
27770 | isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && |
27771 | DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { |
27772 | APInt Zero = APInt::getNullValue(MaskEltSizeInBits); |
27773 | APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); |
27774 | APInt UndefElts(NumMaskElts, 0); |
27775 | SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); |
27776 | for (unsigned i = 0; i != NumMaskElts; ++i) { |
27777 | int M = Mask[i]; |
27778 | if (M == SM_SentinelUndef) { |
27779 | UndefElts.setBit(i); |
27780 | continue; |
27781 | } |
27782 | if (M == SM_SentinelZero) |
27783 | continue; |
27784 | EltBits[i] = AllOnes; |
27785 | } |
27786 | SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); |
27787 | DCI.AddToWorklist(BitMask.getNode()); |
27788 | Res = DAG.getBitcast(MaskVT, V1); |
27789 | DCI.AddToWorklist(Res.getNode()); |
27790 | unsigned AndOpcode = |
27791 | FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); |
27792 | Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); |
27793 | DCI.AddToWorklist(Res.getNode()); |
27794 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27795 | /*AddTo*/ true); |
27796 | return true; |
27797 | } |
27798 | |
27799 | // If we have a single input shuffle with different shuffle patterns in
27800 | // the 128-bit lanes, use a variable mask with VPERMILPS.
27801 | // TODO: Combine other mask types at higher depths.
27802 | if (UnaryShuffle && HasVariableMask && !MaskContainsZeros && |
27803 | ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || |
27804 | (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { |
27805 | SmallVector<SDValue, 16> VPermIdx; |
27806 | for (int M : Mask) { |
27807 | SDValue Idx = |
27808 | M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); |
27809 | VPermIdx.push_back(Idx); |
27810 | } |
27811 | MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); |
27812 | SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); |
27813 | DCI.AddToWorklist(VPermMask.getNode()); |
27814 | Res = DAG.getBitcast(MaskVT, V1); |
27815 | DCI.AddToWorklist(Res.getNode()); |
27816 | Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); |
27817 | DCI.AddToWorklist(Res.getNode()); |
27818 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27819 | /*AddTo*/ true); |
27820 | return true; |
27821 | } |
27822 | |
27823 | // With XOP, binary shuffles of 128/256-bit floating point vectors can combine |
27824 | // to VPERMIL2PD/VPERMIL2PS. |
27825 | if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() && |
27826 | (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || |
27827 | MaskVT == MVT::v8f32)) { |
27828 | // VPERMIL2 Operation. |
27829 | // Bits[3] - Match Bit. |
27830 | // Bits[2:1] - (Per Lane) PD Shuffle Mask. |
27831 | // Bits[2:0] - (Per Lane) PS Shuffle Mask. |
27832 | unsigned NumLanes = MaskVT.getSizeInBits() / 128; |
27833 | unsigned NumEltsPerLane = NumMaskElts / NumLanes; |
27834 | SmallVector<int, 8> VPerm2Idx; |
27835 | MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); |
27836 | MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); |
27837 | unsigned M2ZImm = 0; |
27838 | for (int M : Mask) { |
27839 | if (M == SM_SentinelUndef) { |
27840 | VPerm2Idx.push_back(-1); |
27841 | continue; |
27842 | } |
27843 | if (M == SM_SentinelZero) { |
27844 | M2ZImm = 2; |
27845 | VPerm2Idx.push_back(8); |
27846 | continue; |
27847 | } |
27848 | int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); |
27849 | Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); |
27850 | VPerm2Idx.push_back(Index); |
27851 | } |
27852 | V1 = DAG.getBitcast(MaskVT, V1); |
27853 | DCI.AddToWorklist(V1.getNode()); |
27854 | V2 = DAG.getBitcast(MaskVT, V2); |
27855 | DCI.AddToWorklist(V2.getNode()); |
27856 | SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true); |
27857 | DCI.AddToWorklist(VPerm2MaskOp.getNode()); |
27858 | Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, |
27859 | DAG.getConstant(M2ZImm, DL, MVT::i8)); |
27860 | DCI.AddToWorklist(Res.getNode()); |
27861 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27862 | /*AddTo*/ true); |
27863 | return true; |
27864 | } |
27865 | |
27866 | // If we have 3 or more shuffle instructions or a chain involving a variable |
27867 | // mask, we can replace them with a single PSHUFB instruction profitably. |
27868 | // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27869 | // instructions, but in practice PSHUFB tends to be *very* fast so we're |
27870 | // more aggressive. |
27871 | if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && |
27872 | ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || |
27873 | (RootVT.is256BitVector() && Subtarget.hasAVX2()) || |
27874 | (RootVT.is512BitVector() && Subtarget.hasBWI()))) { |
27875 | SmallVector<SDValue, 16> PSHUFBMask; |
27876 | int NumBytes = RootVT.getSizeInBits() / 8; |
27877 | int Ratio = NumBytes / NumMaskElts; |
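 | // e.g. (illustrative) a v4i32 mask {1, 0, Zero, Undef} expands with
 | // Ratio == 4 to the byte mask {4,5,6,7, 0,1,2,3, 255 x4, undef x4}.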
27878 | for (int i = 0; i < NumBytes; ++i) { |
27879 | int M = Mask[i / Ratio]; |
27880 | if (M == SM_SentinelUndef) { |
27881 | PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); |
27882 | continue; |
27883 | } |
27884 | if (M == SM_SentinelZero) { |
27885 | PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); |
27886 | continue; |
27887 | } |
27888 | M = Ratio * M + i % Ratio; |
27889 | assert((M / 16) == (i / 16) && "Lane crossing detected");
27890 | PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); |
27891 | } |
27892 | MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); |
27893 | Res = DAG.getBitcast(ByteVT, V1); |
27894 | DCI.AddToWorklist(Res.getNode()); |
27895 | SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); |
27896 | DCI.AddToWorklist(PSHUFBMaskOp.getNode()); |
27897 | Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); |
27898 | DCI.AddToWorklist(Res.getNode()); |
27899 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27900 | /*AddTo*/ true); |
27901 | return true; |
27902 | } |
27903 | |
27904 | // With XOP, if we have a 128-bit binary input shuffle we can always combine |
27905 | // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never |
27906 | // slower than PSHUFB on targets that support both. |
27907 | if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() && |
27908 | Subtarget.hasXOP()) { |
27909 | // VPPERM Mask Operation |
27910 | // Bits[4:0] - Byte Index (0 - 31) |
27911 | // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) |
27912 | SmallVector<SDValue, 16> VPPERMMask; |
27913 | int NumBytes = 16; |
27914 | int Ratio = NumBytes / NumMaskElts; |
27915 | for (int i = 0; i < NumBytes; ++i) { |
27916 | int M = Mask[i / Ratio]; |
27917 | if (M == SM_SentinelUndef) { |
27918 | VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); |
27919 | continue; |
27920 | } |
27921 | if (M == SM_SentinelZero) { |
27922 | VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); |
27923 | continue; |
27924 | } |
27925 | M = Ratio * M + i % Ratio; |
27926 | VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); |
27927 | } |
27928 | MVT ByteVT = MVT::v16i8; |
27929 | V1 = DAG.getBitcast(ByteVT, V1); |
27930 | DCI.AddToWorklist(V1.getNode()); |
27931 | V2 = DAG.getBitcast(ByteVT, V2); |
27932 | DCI.AddToWorklist(V2.getNode()); |
27933 | SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); |
27934 | DCI.AddToWorklist(VPPERMMaskOp.getNode()); |
27935 | Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); |
27936 | DCI.AddToWorklist(Res.getNode()); |
27937 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), |
27938 | /*AddTo*/ true); |
27939 | return true; |
27940 | } |
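      | // Illustrative note (not part of the original source): under the encoding
      | // above, control bytes 0x00-0x1F select from the 32-byte concatenation of
      | // the two sources, and op field 4 (the 128/0x80 emitted for
      | // SM_SentinelZero) writes zero. As a scalar sketch over plain byte arrays:
      | //   for (int i = 0; i != 16; ++i) {
      | //     unsigned Idx = Ctl[i] & 0x1F, Op = Ctl[i] >> 5;
      | //     Dst[i] = (Op == 4) ? 0 : (Idx < 16 ? V1[Idx] : V2[Idx - 16]);
      | //   }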
27941 | |
27942 | // Failed to find any combines. |
27943 | return false; |
27944 | } |
27945 | |
27946 | // Attempt to constant fold all of the constant source ops. |
27947 | // Returns true if the entire shuffle is folded to a constant. |
27948 | // TODO: Extend this to merge multiple constant Ops and update the mask. |
27949 | static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, |
27950 | ArrayRef<int> Mask, SDValue Root, |
27951 | bool HasVariableMask, SelectionDAG &DAG, |
27952 | TargetLowering::DAGCombinerInfo &DCI, |
27953 | const X86Subtarget &Subtarget) { |
27954 | MVT VT = Root.getSimpleValueType(); |
27955 | |
27956 | unsigned SizeInBits = VT.getSizeInBits(); |
27957 | unsigned NumMaskElts = Mask.size(); |
27958 | unsigned MaskSizeInBits = SizeInBits / NumMaskElts; |
27959 | unsigned NumOps = Ops.size(); |
27960 | |
27961 | // Extract constant bits from each source op. |
27962 | bool OneUseConstantOp = false; |
27963 | SmallVector<APInt, 16> UndefEltsOps(NumOps); |
27964 | SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps); |
27965 | for (unsigned i = 0; i != NumOps; ++i) { |
27966 | SDValue SrcOp = Ops[i]; |
27967 | OneUseConstantOp |= SrcOp.hasOneUse(); |
27968 | if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], |
27969 | RawBitsOps[i])) |
27970 | return false; |
27971 | } |
27972 | |
27973 | // Only fold if at least one of the constants has a single use or the
27974 | // combined shuffle has included a variable mask shuffle; this avoids
27975 | // constant pool bloat.
27976 | if (!OneUseConstantOp && !HasVariableMask) |
27977 | return false; |
27978 | |
27979 | // Shuffle the constant bits according to the mask. |
27980 | APInt UndefElts(NumMaskElts, 0); |
27981 | APInt ZeroElts(NumMaskElts, 0); |
27982 | APInt ConstantElts(NumMaskElts, 0); |
27983 | SmallVector<APInt, 8> ConstantBitData(NumMaskElts, |
27984 | APInt::getNullValue(MaskSizeInBits)); |
27985 | for (unsigned i = 0; i != NumMaskElts; ++i) { |
27986 | int M = Mask[i]; |
27987 | if (M == SM_SentinelUndef) { |
27988 | UndefElts.setBit(i); |
27989 | continue; |
27990 | } else if (M == SM_SentinelZero) { |
27991 | ZeroElts.setBit(i); |
27992 | continue; |
27993 | } |
27994 | assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27995 | |
27996 | unsigned SrcOpIdx = (unsigned)M / NumMaskElts; |
27997 | unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; |
27998 | |
27999 | auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; |
28000 | if (SrcUndefElts[SrcMaskIdx]) { |
28001 | UndefElts.setBit(i); |
28002 | continue; |
28003 | } |
28004 | |
28005 | auto &SrcEltBits = RawBitsOps[SrcOpIdx]; |
28006 | APInt &Bits = SrcEltBits[SrcMaskIdx]; |
28007 | if (!Bits) { |
28008 | ZeroElts.setBit(i); |
28009 | continue; |
28010 | } |
28011 | |
28012 | ConstantElts.setBit(i); |
28013 | ConstantBitData[i] = Bits; |
28014 | } |
28015 | assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28016 | |
28017 | // Create the constant data. |
28018 | MVT MaskSVT; |
28019 | if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) |
28020 | MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); |
28021 | else |
28022 | MaskSVT = MVT::getIntegerVT(MaskSizeInBits); |
28023 | |
28024 | MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); |
28025 | |
28026 | SDLoc DL(Root); |
28027 | SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); |
28028 | DCI.AddToWorklist(CstOp.getNode()); |
28029 | DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp)); |
28030 | return true; |
28031 | } |
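      | // Illustrative sketch (not part of the original file): the fold above,
      | // modeled on plain arrays with the same sentinel convention (-1 undef,
      | // -2 zero) and two 4-element constant ops; mask index M addresses
      | // Ops[M / 4][M % 4], mirroring SrcOpIdx/SrcMaskIdx above.
      | static void constantShuffleFoldSketch() {
      |   const int Undef = -1, Zero = -2;
      |   unsigned Ops[2][4] = {{10, 11, 12, 13}, {20, 21, 22, 23}};
      |   int Mask[4] = {0, Zero, 5, Undef};
      |   unsigned Folded[4] = {};
      |   for (int i = 0; i != 4; ++i) {
      |     int M = Mask[i];
      |     if (M == Undef || M == Zero)
      |       continue; // undef lanes may take any value; zero lanes stay 0.
      |     Folded[i] = Ops[M / 4][M % 4]; // Folded == {10, 0, 21, 0}
      |   }
      |   (void)Folded;
      | }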
28032 | |
28033 | /// \brief Fully generic combining of x86 shuffle instructions. |
28034 | /// |
28035 | /// This should be the last combine run over the x86 shuffle instructions. Once |
28036 | /// they have been fully optimized, this will recursively consider all chains |
28037 | /// of single-use shuffle instructions, build a generic model of the cumulative |
28038 | /// shuffle operation, and check for simpler instructions which implement this |
28039 | /// operation. We use this primarily for two purposes: |
28040 | /// |
28041 | /// 1) Collapse generic shuffles to specialized single instructions when |
28042 | /// equivalent. In most cases, this is just an encoding size win, but |
28043 | /// sometimes we will collapse multiple generic shuffles into a single |
28044 | /// special-purpose shuffle. |
28045 | /// 2) Look for sequences of shuffle instructions with 3 or more total |
28046 | /// instructions, and replace them with the slightly more expensive SSSE3 |
28047 | /// PSHUFB instruction if available. We do this as the last combining step |
28048 | /// to ensure we avoid using PSHUFB if we can implement the shuffle with |
28049 | /// a suitable short sequence of other instructions. The PSHUFB will either |
28050 | /// use a register or have to read from memory and so is slightly (but only |
28051 | /// slightly) more expensive than the other shuffle instructions. |
28052 | /// |
28053 | /// Because this is inherently a quadratic operation (for each shuffle in |
28054 | /// a chain, we recurse up the chain), the depth is limited to 8 instructions. |
28055 | /// This should never be an issue in practice as the shuffle lowering doesn't |
28056 | /// produce sequences of more than 8 instructions. |
28057 | /// |
28058 | /// FIXME: We will currently miss some cases where the redundant shuffling |
28059 | /// would simplify under the threshold for PSHUFB formation because of |
28060 | /// combine-ordering. To fix this, we should do the redundant instruction |
28061 | /// combining in this recursive walk. |
28062 | static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, |
28063 | int SrcOpIndex, SDValue Root, |
28064 | ArrayRef<int> RootMask, |
28065 | ArrayRef<const SDNode*> SrcNodes, |
28066 | int Depth, bool HasVariableMask, |
28067 | SelectionDAG &DAG, |
28068 | TargetLowering::DAGCombinerInfo &DCI, |
28069 | const X86Subtarget &Subtarget) { |
28070 | // Bound the depth of our recursive combine because this is ultimately |
28071 | // quadratic in nature. |
28072 | if (Depth > 8) |
28073 | return false; |
28074 | |
28075 | // Directly rip through bitcasts to find the underlying operand. |
28076 | SDValue Op = SrcOps[SrcOpIndex]; |
28077 | Op = peekThroughOneUseBitcasts(Op); |
28078 | |
28079 | MVT VT = Op.getSimpleValueType(); |
28080 | if (!VT.isVector()) |
28081 | return false; // Bail if we hit a non-vector. |
28082 | |
28083 | assert(Root.getSimpleValueType().isVector() &&
28084 |        "Shuffles operate on vector types!");
28085 | assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28086 |        "Can only combine shuffles of the same vector register size.");
28087 | |
28088 | // Extract target shuffle mask and resolve sentinels and inputs. |
28089 | SmallVector<int, 64> OpMask; |
28090 | SmallVector<SDValue, 2> OpInputs; |
28091 | if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) |
28092 | return false; |
28093 | |
28094 | assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28095 | SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); |
28096 | SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); |
28097 | |
28098 | // Add the inputs to the Ops list, avoiding duplicates. |
28099 | SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); |
28100 | |
28101 | int InputIdx0 = -1, InputIdx1 = -1; |
28102 | for (int i = 0, e = Ops.size(); i < e; ++i) { |
28103 | SDValue BC = peekThroughBitcasts(Ops[i]); |
28104 | if (Input0 && BC == peekThroughBitcasts(Input0)) |
28105 | InputIdx0 = i; |
28106 | if (Input1 && BC == peekThroughBitcasts(Input1)) |
28107 | InputIdx1 = i; |
28108 | } |
28109 | |
28110 | if (Input0 && InputIdx0 < 0) { |
28111 | InputIdx0 = SrcOpIndex; |
28112 | Ops[SrcOpIndex] = Input0; |
28113 | } |
28114 | if (Input1 && InputIdx1 < 0) { |
28115 | InputIdx1 = Ops.size(); |
28116 | Ops.push_back(Input1); |
28117 | } |
28118 | |
28119 | assert(((RootMask.size() > OpMask.size() &&
28120 |          RootMask.size() % OpMask.size() == 0) ||
28121 |         (OpMask.size() > RootMask.size() &&
28122 |          OpMask.size() % RootMask.size() == 0) ||
28123 |         OpMask.size() == RootMask.size()) &&
28124 |        "The smaller number of elements must divide the larger.");
28125 | |
28126 | // This function can be performance-critical, so we rely on the power-of-2 |
28127 | // knowledge that we have about the mask sizes to replace div/rem ops with |
28128 | // bit-masks and shifts. |
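      | // Illustrative note (not part of the original source): for a power-of-2 N
      | // with Log2N = countTrailingZeros(N), i / N == i >> Log2N and
      | // i % N == i & (N - 1); e.g. for N = 4, 13 / 4 == 13 >> 2 == 3 and
      | // 13 % 4 == 13 & 3 == 1. The mask merging below uses exactly these
      | // rewrites.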
28129 | assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28130 | assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28131 | unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); |
28132 | unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); |
28133 | |
28134 | unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); |
28135 | unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); |
28136 | unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); |
28137 | assert((RootRatio == 1 || OpRatio == 1) &&
28138 |        "Must not have a ratio for both incoming and op masks!");
28139 | |
28140 | assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28141 | assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28142 | assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28143 | unsigned RootRatioLog2 = countTrailingZeros(RootRatio); |
28144 | unsigned OpRatioLog2 = countTrailingZeros(OpRatio); |
28145 | |
28146 | SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef); |
28147 | |
28148 | // Merge this shuffle operation's mask into our accumulated mask. Note that |
28149 | // this shuffle's mask will be the first applied to the input, followed by the |
28150 | // root mask to get us all the way to the root value arrangement. The reason |
28151 | // for this order is that we are recursing up the operation chain. |
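      | // Illustrative note (not part of the original source): when both masks have
      | // the same width (RootRatio == OpRatio == 1), the loop below is plain mask
      | // composition, Mask[i] = OpMask[RootMask[i]]. E.g. applying
      | // OpMask = <2,3,0,1> first and then RootMask = <1,0,3,2> gives
      | // Mask = <3,2,1,0>; the ratio logic generalizes this to widths that differ
      | // by a power-of-2 factor.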
28152 | for (unsigned i = 0; i < MaskWidth; ++i) { |
28153 | unsigned RootIdx = i >> RootRatioLog2; |
28154 | if (RootMask[RootIdx] < 0) { |
28155 | // This is a zero or undef lane, we're done. |
28156 | Mask[i] = RootMask[RootIdx]; |
28157 | continue; |
28158 | } |
28159 | |
28160 | unsigned RootMaskedIdx = |
28161 | RootRatio == 1 |
28162 | ? RootMask[RootIdx] |
28163 | : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); |
28164 | |
28165 | // Just insert the scaled root mask value if it references an input other |
28166 | // than the SrcOp we're currently inserting. |
28167 | if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || |
28168 | (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { |
28169 | Mask[i] = RootMaskedIdx; |
28170 | continue; |
28171 | } |
28172 | |
28173 | RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); |
28174 | unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; |
28175 | if (OpMask[OpIdx] < 0) { |
28176 | // The incoming lanes are zero or undef, it doesn't matter which ones we |
28177 | // are using. |
28178 | Mask[i] = OpMask[OpIdx]; |
28179 | continue; |
28180 | } |
28181 | |
28182 | // Ok, we have non-zero lanes, map them through to one of the Op's inputs. |
28183 | unsigned OpMaskedIdx = |
28184 | OpRatio == 1 |
28185 | ? OpMask[OpIdx] |
28186 | : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); |
28187 | |
28188 | OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); |
28189 | if (OpMask[OpIdx] < (int)OpMask.size()) { |
28190 | assert(0 <= InputIdx0 && "Unknown target shuffle input");
28191 | OpMaskedIdx += InputIdx0 * MaskWidth; |
28192 | } else { |
28193 | assert(0 <= InputIdx1 && "Unknown target shuffle input");
28194 | OpMaskedIdx += InputIdx1 * MaskWidth; |
28195 | } |
28196 | |
28197 | Mask[i] = OpMaskedIdx; |
28198 | } |
28199 | |
28200 | // Handle the all undef/zero cases early. |
28201 | if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { |
28202 | DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); |
28203 | return true; |
28204 | } |
28205 | if (all_of(Mask, [](int Idx) { return Idx < 0; })) { |
28206 | // TODO - should we handle the mixed zero/undef case as well? Just returning
28207 | // a zero mask will lose information on undef elements, possibly reducing
28208 | // future combine possibilities.
28209 | DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(), |
28210 | Subtarget, DAG, SDLoc(Root))); |
28211 | return true; |
28212 | } |
28213 | |
28214 | // Remove unused shuffle source ops. |
28215 | resolveTargetShuffleInputsAndMask(Ops, Mask); |
28216 | assert(!Ops.empty() && "Shuffle with no inputs detected");
28217 | |
28218 | HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); |
28219 | |
28220 | // Update the list of shuffle nodes that have been combined so far. |
28221 | SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), |
28222 | SrcNodes.end()); |
28223 | CombinedNodes.push_back(Op.getNode()); |
28224 | |
28225 | // See if we can recurse into each shuffle source op (if it's a target |
28226 | // shuffle). The source op should only be combined if it either has a |
28227 | // single use (i.e. current Op) or all its users have already been combined. |
28228 | for (int i = 0, e = Ops.size(); i < e; ++i) |
28229 | if (Ops[i].getNode()->hasOneUse() || |
28230 | SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) |
28231 | if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes, |
28232 | Depth + 1, HasVariableMask, DAG, DCI, |
28233 | Subtarget)) |
28234 | return true; |
28235 | |
28236 | // Attempt to constant fold all of the constant source ops. |
28237 | if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI, |
28238 | Subtarget)) |
28239 | return true; |
28240 | |
28241 | // We can only combine unary and binary shuffle mask cases. |
28242 | if (Ops.size() > 2) |
28243 | return false; |
28244 | |
28245 | // Minor canonicalization of the accumulated shuffle mask to make it easier |
28246 | // to match below. All this does is detect masks with sequential pairs of |
28247 | // elements, and shrink them to the half-width mask. It does this in a loop |
28248 | // so it will reduce the size of the mask to the minimal width mask which |
28249 | // performs an equivalent shuffle. |
28250 | SmallVector<int, 64> WidenedMask; |
28251 | while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { |
28252 | Mask = std::move(WidenedMask); |
28253 | } |
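      | // Illustrative note (not part of the original source): e.g. the 8-element
      | // mask <2,3,0,1,6,7,4,5> pairs up into the 4-element mask <1,0,3,2>; a
      | // second iteration cannot pair <1,0,3,2> any further, so the loop stops.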
28254 | |
28255 | // Canonicalization of binary shuffle masks to improve pattern matching by |
28256 | // commuting the inputs. |
28257 | if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { |
28258 | ShuffleVectorSDNode::commuteMask(Mask); |
28259 | std::swap(Ops[0], Ops[1]); |
28260 | } |
28261 | |
28262 | return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG, |
28263 | DCI, Subtarget); |
28264 | } |
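      | // Illustrative sketch (not part of the original file): the recursion above,
      | // reduced to composing plain integer masks up a chain of nodes with the
      | // same depth bound. ChainNodeSketch and its fields are hypothetical.
      | struct ChainNodeSketch {
      |   SmallVector<int, 64> Mask;          // this node's shuffle mask
      |   ChainNodeSketch *Operand = nullptr; // next single-use shuffle up the chain
      | };
      | static void composeChainSketch(const ChainNodeSketch *N,
      |                                SmallVectorImpl<int> &Acc, int Depth) {
      |   if (!N || Depth > 8) // same bound as above: the walk is quadratic
      |     return;
      |   for (int &M : Acc)   // fold this node's mask into the accumulator,
      |     if (M >= 0)        // leaving undef/zero sentinels untouched
      |       M = N->Mask[M];
      |   composeChainSketch(N->Operand, Acc, Depth + 1);
      | }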
28265 | |
28266 | /// \brief Get the PSHUF-style mask from PSHUF node. |
28267 | /// |
28268 | /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28269 | /// PSHUF-style masks that can be reused with such instructions. |
28270 | static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { |
28271 | MVT VT = N.getSimpleValueType(); |
28272 | SmallVector<int, 4> Mask; |
28273 | SmallVector<SDValue, 2> Ops; |
28274 | bool IsUnary; |
28275 | bool HaveMask = |
28276 | getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); |
28277 | (void)HaveMask; |
28278 | assert(HaveMask);
28279 | |
28280 | // If we have more than 128 bits, only the low 128 bits of the shuffle mask
28281 | // matter. Check that the upper masks are repeats and remove them.
28282 | if (VT.getSizeInBits() > 128) { |
28283 | int LaneElts = 128 / VT.getScalarSizeInBits(); |
28284 | #ifndef NDEBUG |
28285 | for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) |
28286 | for (int j = 0; j < LaneElts; ++j) |
28287 | assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28288 |        "Mask doesn't repeat in high 128-bit lanes!");
28289 | #endif |
28290 | Mask.resize(LaneElts); |
28291 | } |
28292 | |
28293 | switch (N.getOpcode()) { |
28294 | case X86ISD::PSHUFD: |
28295 | return Mask; |
28296 | case X86ISD::PSHUFLW: |
28297 | Mask.resize(4); |
28298 | return Mask; |
28299 | case X86ISD::PSHUFHW: |
28300 | Mask.erase(Mask.begin(), Mask.begin() + 4); |
28301 | for (int &M : Mask) |
28302 | M -= 4; |
28303 | return Mask; |
28304 | default: |
28305 | llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 28305); |
28306 | } |
28307 | } |
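      | // Illustrative sketch (not part of the original file): the masks returned
      | // above correspond to a PSHUF-family imm8, where each 2-bit field selects
      | // one of four elements:
      | static SmallVector<int, 4> decodePSHUFImm8Sketch(unsigned Imm) {
      |   SmallVector<int, 4> Mask;
      |   for (int i = 0; i != 4; ++i)
      |     Mask.push_back((Imm >> (2 * i)) & 0x3); // e.g. 0x1B -> <3,2,1,0>
      |   return Mask;
      | }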
28308 | |
28309 | /// \brief Search for a combinable shuffle across a chain ending in pshufd. |
28310 | /// |
28311 | /// We walk up the chain and look for a combinable shuffle, skipping over |
28312 | /// shuffles that we could hoist this shuffle's transformation past without |
28313 | /// altering anything. |
28314 | static SDValue |
28315 | combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, |
28316 | SelectionDAG &DAG) { |
28317 | assert(N.getOpcode() == X86ISD::PSHUFD &&
28318 |        "Called with something other than an x86 128-bit half shuffle!");
28319 | SDLoc DL(N); |
28320 | |
28321 | // Walk up a single-use chain looking for a combinable shuffle. Keep a stack |
28322 | // of the shuffles in the chain so that we can form a fresh chain to replace |
28323 | // this one. |
28324 | SmallVector<SDValue, 8> Chain; |
28325 | SDValue V = N.getOperand(0); |
28326 | for (; V.hasOneUse(); V = V.getOperand(0)) { |
28327 | switch (V.getOpcode()) { |
28328 | default: |
28329 | return SDValue(); // Nothing combined! |
28330 | |
28331 | case ISD::BITCAST: |
28332 | // Skip bitcasts as we always know the type for the target specific |
28333 | // instructions. |
28334 | continue; |
28335 | |
28336 | case X86ISD::PSHUFD: |
28337 | // Found another dword shuffle. |
28338 | break; |
28339 | |
28340 | case X86ISD::PSHUFLW: |
28341 | // Check that the low words (being shuffled) are the identity in the |
28342 | // dword shuffle, and the high words are self-contained. |
28343 | if (Mask[0] != 0 || Mask[1] != 1 || |
28344 | !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) |
28345 | return SDValue(); |
28346 | |
28347 | Chain.push_back(V); |
28348 | continue; |
28349 | |
28350 | case X86ISD::PSHUFHW: |
28351 | // Check that the high words (being shuffled) are the identity in the |
28352 | // dword shuffle, and the low words are self-contained. |
28353 | if (Mask[2] != 2 || Mask[3] != 3 || |
28354 | !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) |
28355 | return SDValue(); |
28356 | |
28357 | Chain.push_back(V); |
28358 | continue; |
28359 | |
28360 | case X86ISD::UNPCKL: |
28361 | case X86ISD::UNPCKH: |
28362 | // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword |
28363 | // shuffle into a preceding word shuffle. |
28364 | if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && |
28365 | V.getSimpleValueType().getVectorElementType() != MVT::i16) |
28366 | return SDValue(); |
28367 | |
28368 | // Search for a half-shuffle which we can combine with. |
28369 | unsigned CombineOp = |
28370 | V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; |
28371 | if (V.getOperand(0) != V.getOperand(1) || |
28372 | !V->isOnlyUserOf(V.getOperand(0).getNode())) |
28373 | return SDValue(); |
28374 | Chain.push_back(V); |
28375 | V = V.getOperand(0); |
28376 | do { |
28377 | switch (V.getOpcode()) { |
28378 | default: |
28379 | return SDValue(); // Nothing to combine. |
28380 | |
28381 | case X86ISD::PSHUFLW: |
28382 | case X86ISD::PSHUFHW: |
28383 | if (V.getOpcode() == CombineOp) |
28384 | break; |
28385 | |
28386 | Chain.push_back(V); |
28387 | |
28388 | LLVM_FALLTHROUGH;
28389 | case ISD::BITCAST: |
28390 | V = V.getOperand(0); |
28391 | continue; |
28392 | } |
28393 | break; |
28394 | } while (V.hasOneUse()); |
28395 | break; |
28396 | } |
28397 | // Break out of the loop if we break out of the switch. |
28398 | break; |
28399 | } |
28400 | |
28401 | if (!V.hasOneUse()) |
28402 | // We fell out of the loop without finding a viable combining instruction. |
28403 | return SDValue(); |
28404 | |
28405 | // Merge this node's mask and our incoming mask. |
28406 | SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); |
28407 | for (int &M : Mask) |
28408 | M = VMask[M]; |
28409 | V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), |
28410 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
28411 | |
28412 | // Rebuild the chain around this new shuffle. |
28413 | while (!Chain.empty()) { |
28414 | SDValue W = Chain.pop_back_val(); |
28415 | |
28416 | if (V.getValueType() != W.getOperand(0).getValueType()) |
28417 | V = DAG.getBitcast(W.getOperand(0).getValueType(), V); |
28418 | |
28419 | switch (W.getOpcode()) { |
28420 | default: |
28421 | llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!" , "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 28421); |
28422 | |
28423 | case X86ISD::UNPCKL: |
28424 | case X86ISD::UNPCKH: |
28425 | V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); |
28426 | break; |
28427 | |
28428 | case X86ISD::PSHUFD: |
28429 | case X86ISD::PSHUFLW: |
28430 | case X86ISD::PSHUFHW: |
28431 | V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); |
28432 | break; |
28433 | } |
28434 | } |
28435 | if (V.getValueType() != N.getValueType()) |
28436 | V = DAG.getBitcast(N.getValueType(), V); |
28437 | |
28438 | // Return the new chain to replace N. |
28439 | return V; |
28440 | } |
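      | // Illustrative sketch (not part of the original file): the merge step above
      | // ("M = VMask[M]") composes two dword shuffles; e.g. a PSHUFD by <1,0,3,2>
      | // of a PSHUFD by <2,3,0,1> is a single PSHUFD by <3,2,1,0>:
      | static void composeDWordMasksSketch() {
      |   int VMask[4] = {2, 3, 0, 1}; // inner shuffle, applied first
      |   int Mask[4] = {1, 0, 3, 2};  // outer shuffle (this node)
      |   for (int &M : Mask)
      |     M = VMask[M];              // Mask == {3, 2, 1, 0}
      | }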
28441 | |
28442 | /// \brief Search for a combinable shuffle across a chain ending in pshuflw or |
28443 | /// pshufhw. |
28444 | /// |
28445 | /// We walk up the chain, skipping shuffles of the other half and looking |
28446 | /// through shuffles which switch halves trying to find a shuffle of the same |
28447 | /// pair of dwords. |
28448 | static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, |
28449 | SelectionDAG &DAG, |
28450 | TargetLowering::DAGCombinerInfo &DCI) { |
28451 | assert((N.getOpcode() == X86ISD::PSHUFLW ||
28452 |         N.getOpcode() == X86ISD::PSHUFHW) &&
28453 |        "Called with something other than an x86 128-bit half shuffle!");
28454 | SDLoc DL(N); |
28455 | unsigned CombineOpcode = N.getOpcode(); |
28456 | |
28457 | // Walk up a single-use chain looking for a combinable shuffle. |
28458 | SDValue V = N.getOperand(0); |
28459 | for (; V.hasOneUse(); V = V.getOperand(0)) { |
28460 | switch (V.getOpcode()) { |
28461 | default: |
28462 | return false; // Nothing combined! |
28463 | |
28464 | case ISD::BITCAST: |
28465 | // Skip bitcasts as we always know the type for the target specific |
28466 | // instructions. |
28467 | continue; |
28468 | |
28469 | case X86ISD::PSHUFLW: |
28470 | case X86ISD::PSHUFHW: |
28471 | if (V.getOpcode() == CombineOpcode) |
28472 | break; |
28473 | |
28474 | // Other-half shuffles are no-ops. |
28475 | continue; |
28476 | } |
28477 | // Break out of the loop if we break out of the switch. |
28478 | break; |
28479 | } |
28480 | |
28481 | if (!V.hasOneUse()) |
28482 | // We fell out of the loop without finding a viable combining instruction. |
28483 | return false; |
28484 | |
28485 | // Combine away the bottom node as its shuffle will be accumulated into |
28486 | // a preceding shuffle. |
28487 | DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); |
28488 | |
28489 | // Record the old value. |
28490 | SDValue Old = V; |
28491 | |
28492 | // Merge this node's mask and our incoming mask (adjusted to account for all |
28493 | // the pshufd instructions encountered). |
28494 | SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); |
28495 | for (int &M : Mask) |
28496 | M = VMask[M]; |
28497 | V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), |
28498 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); |
28499 | |
28500 | // Check that the shuffles didn't cancel each other out. If not, we need to |
28501 | // combine to the new one. |
28502 | if (Old != V) |
28503 | // Replace the combinable shuffle with the combined one, updating all users |
28504 | // so that we re-evaluate the chain here. |
28505 | DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); |
28506 | |
28507 | return true; |
28508 | } |
28509 | |
28510 | /// \brief Try to combine x86 target specific shuffles. |
28511 | static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, |
28512 | TargetLowering::DAGCombinerInfo &DCI, |
28513 | const X86Subtarget &Subtarget) { |
28514 | SDLoc DL(N); |
28515 | MVT VT = N.getSimpleValueType(); |
28516 | SmallVector<int, 4> Mask; |
28517 | |
28518 | unsigned Opcode = N.getOpcode(); |
28519 | switch (Opcode) { |
28520 | case X86ISD::PSHUFD: |
28521 | case X86ISD::PSHUFLW: |
28522 | case X86ISD::PSHUFHW: |
28523 | Mask = getPSHUFShuffleMask(N); |
28524 | assert(Mask.size() == 4);
28525 | break; |
28526 | case X86ISD::UNPCKL: { |
28527 | auto Op0 = N.getOperand(0); |
28528 | auto Op1 = N.getOperand(1); |
28529 | unsigned Opcode0 = Op0.getOpcode(); |
28530 | unsigned Opcode1 = Op1.getOpcode(); |
28531 | |
28532 | // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single |
28533 | // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization. |
28534 | // TODO: Add other horizontal operations as required. |
28535 | if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD) |
28536 | return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0)); |
28537 | |
28538 | // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in |
28539 | // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28540 | // moves upper half elements into the lower half part. For example: |
28541 | // |
28542 | // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, |
28543 | // undef:v16i8 |
28544 | // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 |
28545 | // |
28546 | // will be combined to: |
28547 | // |
28548 | // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 |
28549 | |
28550 | // This is done only for 128-bit vectors. From SSE4.1 onward this combine may
28551 | // not fire because more capable instructions cover these patterns.
28552 | if (!VT.is128BitVector()) |
28553 | return SDValue(); |
28554 | |
28555 | if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) { |
28556 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); |
28557 | |
28558 | unsigned NumElts = VT.getVectorNumElements(); |
28559 | SmallVector<int, 8> ExpectedMask(NumElts, -1); |
28560 | std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, |
28561 | NumElts / 2); |
28562 | |
28563 | auto ShufOp = Op1.getOperand(0); |
28564 | if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) |
28565 | return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); |
28566 | } |
28567 | return SDValue(); |
28568 | } |
28569 | case X86ISD::BLENDI: { |
28570 | SDValue V0 = N->getOperand(0); |
28571 | SDValue V1 = N->getOperand(1); |
28572 | assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28573 |        "Unexpected input vector types");
28574 | |
28575 | // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector |
28576 | // operands and changing the mask to 1. This saves us a bunch of |
28577 | // pattern-matching possibilities related to scalar math ops in SSE/AVX. |
28578 | // x86InstrInfo knows how to commute this back after instruction selection |
28579 | // if it would help register allocation. |
28580 | |
28581 | // TODO: If optimizing for size or a processor that doesn't suffer from |
28582 | // partial register update stalls, this should be transformed into a MOVSD |
28583 | // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. |
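      | // Illustrative note (not part of the original source): for v2f64, imm bit i
      | // selects result element i from the second operand. BLENDI(V0, V1, 2) is
      | // <V0[0], V1[1]>, and BLENDI(V1, V0, 1) is <V0[0], V1[1]> as well, so the
      | // operand swap below with mask 1 is an exact replacement.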
28584 | |
28585 | if (VT == MVT::v2f64) |
28586 | if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) |
28587 | if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { |
28588 | SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); |
28589 | return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); |
28590 | } |
28591 | |
28592 | return SDValue(); |
28593 | } |
28594 | case X86ISD::MOVSD: |
28595 | case X86ISD::MOVSS: { |
28596 | SDValue V0 = peekThroughBitcasts(N->getOperand(0)); |
28597 | SDValue V1 = peekThroughBitcasts(N->getOperand(1)); |
28598 | bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); |
28599 | bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); |
28600 | if (isZero0 && isZero1) |
28601 | return SDValue(); |
28602 | |
28603 | // We often lower to MOVSD/MOVSS from integer as well as native float |
28604 | // types; remove unnecessary domain-crossing bitcasts if we can to make it |
28605 | // easier to combine shuffles later on. We've already accounted for the |
28606 | // domain switching cost when we decided to lower with it. |
28607 | bool isFloat = VT.isFloatingPoint(); |
28608 | bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); |
28609 | bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); |
28610 | if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { |
28611 | MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) |
28612 | : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); |
28613 | V0 = DAG.getBitcast(NewVT, V0); |
28614 | V1 = DAG.getBitcast(NewVT, V1); |
28615 | return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1)); |
28616 | } |
28617 | |
28618 | return SDValue(); |
28619 | } |
28620 | case X86ISD::INSERTPS: { |
28621 | assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28622 | SDValue Op0 = N.getOperand(0); |
28623 | SDValue Op1 = N.getOperand(1); |
28624 | SDValue Op2 = N.getOperand(2); |
28625 | unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); |
28626 | unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; |
28627 | unsigned DstIdx = (InsertPSMask >> 4) & 0x3; |
28628 | unsigned ZeroMask = InsertPSMask & 0xF; |
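      | // Illustrative note (not part of the original source): e.g. an InsertPSMask
      | // of 0x94 (binary 10'01'0100) decodes to SrcIdx = 2, DstIdx = 1 and
      | // ZeroMask = 0100, i.e. "copy element 2 of Op1 into result element 1, zero
      | // result element 2, and take the rest from Op0".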
28629 | |
28630 | // If we zero out all elements from Op0 then we don't need to reference it. |
28631 | if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) |
28632 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, |
28633 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
28634 | |
28635 | // If we zero out the element from Op1 then we don't need to reference it. |
28636 | if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) |
28637 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), |
28638 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
28639 | |
28640 | // Attempt to merge insertps Op1 with an inner target shuffle node. |
28641 | SmallVector<int, 8> TargetMask1; |
28642 | SmallVector<SDValue, 2> Ops1; |
28643 | if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { |
28644 | int M = TargetMask1[SrcIdx]; |
28645 | if (isUndefOrZero(M)) { |
28646 | // Zero/UNDEF insertion - zero out element and remove dependency. |
28647 | InsertPSMask |= (1u << DstIdx); |
28648 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), |
28649 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
28650 | } |
28651 | // Update insertps mask srcidx and reference the source input directly. |
28652 | assert(0 <= M && M < 8 && "Shuffle index out of range");
28653 | InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); |
28654 | Op1 = Ops1[M < 4 ? 0 : 1]; |
28655 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, |
28656 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
28657 | } |
28658 | |
28659 | // Attempt to merge insertps Op0 with an inner target shuffle node. |
28660 | SmallVector<int, 8> TargetMask0; |
28661 | SmallVector<SDValue, 2> Ops0; |
28662 | if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) |
28663 | return SDValue(); |
28664 | |
28665 | bool Updated = false; |
28666 | bool UseInput00 = false; |
28667 | bool UseInput01 = false; |
28668 | for (int i = 0; i != 4; ++i) { |
28669 | int M = TargetMask0[i]; |
28670 | if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { |
28671 | // No change if element is already zero or the inserted element. |
28672 | continue; |
28673 | } else if (isUndefOrZero(M)) { |
28674 | // If the target mask is undef/zero then we must zero the element. |
28675 | InsertPSMask |= (1u << i); |
28676 | Updated = true; |
28677 | continue; |
28678 | } |
28679 | |
28680 | // The input vector element must be inline. |
28681 | if (M != i && M != (i + 4)) |
28682 | return SDValue(); |
28683 | |
28684 | // Determine which inputs of the target shuffle we're using. |
28685 | UseInput00 |= (0 <= M && M < 4); |
28686 | UseInput01 |= (4 <= M); |
28687 | } |
28688 | |
28689 | // If we're not using both inputs of the target shuffle then use the |
28690 | // referenced input directly. |
28691 | if (UseInput00 && !UseInput01) { |
28692 | Updated = true; |
28693 | Op0 = Ops0[0]; |
28694 | } else if (!UseInput00 && UseInput01) { |
28695 | Updated = true; |
28696 | Op0 = Ops0[1]; |
28697 | } |
28698 | |
28699 | if (Updated) |
28700 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, |
28701 | DAG.getConstant(InsertPSMask, DL, MVT::i8)); |
28702 | |
28703 | return SDValue(); |
28704 | } |
28705 | default: |
28706 | return SDValue(); |
28707 | } |
28708 | |
28709 | // Nuke no-op shuffles that show up after combining. |
28710 | if (isNoopShuffleMask(Mask)) |
28711 | return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); |
28712 | |
28713 | // Look for simplifications involving one or two shuffle instructions. |
28714 | SDValue V = N.getOperand(0); |
28715 | switch (N.getOpcode()) { |
28716 | default: |
28717 | break; |
28718 | case X86ISD::PSHUFLW: |
28719 | case X86ISD::PSHUFHW: |
28720 | assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28721 | |
28722 | if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) |
28723 | return SDValue(); // We combined away this shuffle, so we're done. |
28724 | |
28725 | // See if this reduces to a PSHUFD which is no more expensive and can |
28726 | // combine with more operations. Note that it has to at least flip the |
28727 | // dwords as otherwise it would have been removed as a no-op. |
28728 | if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { |
28729 | int DMask[] = {0, 1, 2, 3}; |
28730 | int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; |
28731 | DMask[DOffset + 0] = DOffset + 1; |
28732 | DMask[DOffset + 1] = DOffset + 0; |
28733 | MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); |
28734 | V = DAG.getBitcast(DVT, V); |
28735 | DCI.AddToWorklist(V.getNode()); |
28736 | V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, |
28737 | getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); |
28738 | DCI.AddToWorklist(V.getNode()); |
28739 | return DAG.getBitcast(VT, V); |
28740 | } |
28741 | |
28742 | // Look for shuffle patterns which can be implemented as a single unpack. |
28743 | // FIXME: This doesn't handle the location of the PSHUFD generically, and |
28744 | // only works when we have a PSHUFD followed by two half-shuffles. |
28745 | if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && |
28746 | (V.getOpcode() == X86ISD::PSHUFLW || |
28747 | V.getOpcode() == X86ISD::PSHUFHW) && |
28748 | V.getOpcode() != N.getOpcode() && |
28749 | V.hasOneUse()) { |
28750 | SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); |
28751 | if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { |
28752 | SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); |
28753 | SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); |
28754 | int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; |
28755 | int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; |
28756 | int WordMask[8]; |
28757 | for (int i = 0; i < 4; ++i) { |
28758 | WordMask[i + NOffset] = Mask[i] + NOffset; |
28759 | WordMask[i + VOffset] = VMask[i] + VOffset; |
28760 | } |
28761 | // Map the word mask through the DWord mask. |
28762 | int MappedMask[8]; |
28763 | for (int i = 0; i < 8; ++i) |
28764 | MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; |
28765 | if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || |
28766 | makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { |
28767 | // We can replace all three shuffles with an unpack. |
28768 | V = DAG.getBitcast(VT, D.getOperand(0)); |
28769 | DCI.AddToWorklist(V.getNode()); |
28770 | return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL |
28771 | : X86ISD::UNPCKH, |
28772 | DL, VT, V, V); |
28773 | } |
28774 | } |
28775 | } |
28776 | |
28777 | break; |
28778 | |
28779 | case X86ISD::PSHUFD: |
28780 | if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) |
28781 | return NewN; |
28782 | |
28783 | break; |
28784 | } |
28785 | |
28786 | return SDValue(); |
28787 | } |
28788 | |
28789 | /// Returns true iff the shuffle node \p N can be replaced with ADDSUB |
28790 | /// operation. If true is returned then the operands of ADDSUB operation |
28791 | /// are written to the parameters \p Opnd0 and \p Opnd1. |
28792 | /// |
28793 | /// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
28794 | /// so they are easier to match generically. We also insert dummy vector
28795 | /// shuffle nodes for the operands that explicitly discard the lanes unused
28796 | /// by this operation, so the fact that those lanes are unused can flow
28797 | /// through the rest of the combiner.
28798 | static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, |
28799 | SDValue &Opnd0, SDValue &Opnd1) { |
28800 | |
28801 | EVT VT = N->getValueType(0); |
28802 | if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && |
28803 | (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && |
28804 | (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) |
28805 | return false; |
28806 | |
28807 | // We only handle target-independent shuffles. |
28808 | // FIXME: It would be easy and harmless to use the target shuffle mask |
28809 | // extraction tool to support more. |
28810 | if (N->getOpcode() != ISD::VECTOR_SHUFFLE) |
28811 | return false; |
28812 | |
28813 | ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask(); |
28814 | SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end()); |
28815 | |
28816 | SDValue V1 = N->getOperand(0); |
28817 | SDValue V2 = N->getOperand(1); |
28818 | |
28819 | // We require the first shuffle operand to be the FSUB node, and the second to |
28820 | // be the FADD node. |
28821 | if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { |
28822 | ShuffleVectorSDNode::commuteMask(Mask); |
28823 | std::swap(V1, V2); |
28824 | } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) |
28825 | return false; |
28826 | |
28827 | // If there are other uses of these operations we can't fold them. |
28828 | if (!V1->hasOneUse() || !V2->hasOneUse()) |
28829 | return false; |
28830 | |
28831 | // Ensure that both operations have the same operands. Note that we can |
28832 | // commute the FADD operands. |
28833 | SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); |
28834 | if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && |
28835 | (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) |
28836 | return false; |
28837 | |
28838 | // We're looking for blends between FADD and FSUB nodes. We insist on these |
28839 | // nodes being lined up in a specific expected pattern. |
28840 | if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || |
28841 | isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || |
28842 | isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) || |
28843 | isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23, |
28844 | 8, 25, 10, 27, 12, 29, 14, 31}))) |
28845 | return false; |
28846 | |
28847 | Opnd0 = LHS; |
28848 | Opnd1 = RHS; |
28849 | return true; |
28850 | } |
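      | // Illustrative sketch (not part of the original file): the pattern accepted
      | // above in scalar form for v4f32. The masks ({0,5,2,7} etc.) take the FSUB
      | // lanes at even indices and the FADD lanes at odd indices, which is exactly
      | // what an ADDSUB instruction computes:
      | static void addSubModelSketch(const float A[4], const float B[4],
      |                               float R[4]) {
      |   for (int i = 0; i != 4; ++i)
      |     R[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
      | }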
28851 | |
28852 | /// \brief Try to combine a shuffle into a target-specific add-sub or |
28853 | /// mul-add-sub node. |
28854 | static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, |
28855 | const X86Subtarget &Subtarget, |
28856 | SelectionDAG &DAG) { |
28857 | SDValue Opnd0, Opnd1; |
28858 | if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) |
28859 | return SDValue(); |
28860 | |
28861 | EVT VT = N->getValueType(0); |
28862 | SDLoc DL(N); |
28863 | |
28864 | // Try to generate X86ISD::FMADDSUB node here. |
28865 | SDValue Opnd2; |
28866 | if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) |
28867 | return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); |
28868 | |
28869 | // Do not generate X86ISD::ADDSUB node for 512-bit types even though |
28870 | // the ADDSUB idiom has been successfully recognized. There are no known |
28871 | // X86 targets with 512-bit ADDSUB instructions! |
28872 | if (VT.is512BitVector()) |
28873 | return SDValue(); |
28874 | |
28875 | return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); |
28876 | } |
28877 | |
28878 | // We are looking for a shuffle where both sources are concatenated with undef |
28879 | // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so |
28880 | // if we can express this as a single-source shuffle, that's preferable. |
28881 | static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, |
28882 | const X86Subtarget &Subtarget) { |
28883 | if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N)) |
28884 | return SDValue(); |
28885 | |
28886 | EVT VT = N->getValueType(0); |
28887 | |
28888 | // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. |
28889 | if (!VT.is128BitVector() && !VT.is256BitVector()) |
28890 | return SDValue(); |
28891 | |
28892 | if (VT.getVectorElementType() != MVT::i32 && |
28893 | VT.getVectorElementType() != MVT::i64 && |
28894 | VT.getVectorElementType() != MVT::f32 && |
28895 | VT.getVectorElementType() != MVT::f64) |
28896 | return SDValue(); |
28897 | |
28898 | SDValue N0 = N->getOperand(0); |
28899 | SDValue N1 = N->getOperand(1); |
28900 | |
28901 | // Check that both sources are concats with undef. |
28902 | if (N0.getOpcode() != ISD::CONCAT_VECTORS || |
28903 | N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || |
28904 | N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || |
28905 | !N1.getOperand(1).isUndef()) |
28906 | return SDValue(); |
28907 | |
28908 | // Construct the new shuffle mask. Elements from the first source retain their |
28909 | // index, but elements from the second source no longer need to skip an undef. |
28910 | SmallVector<int, 8> Mask; |
28911 | int NumElts = VT.getVectorNumElements(); |
28912 | |
28913 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); |
28914 | for (int Elt : SVOp->getMask()) |
28915 | Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); |
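      | // E.g. for v8i32 with v4i32 sources t1 and t2, the mask <0,8,1,9,2,10,3,11>
      | // over (concat t1, undef) and (concat t2, undef) becomes <0,4,1,5,2,6,3,7>
      | // over (concat t1, t2), a single-source shuffle VPERMD can handle.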
28916 | |
28917 | SDLoc DL(N); |
28918 | SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), |
28919 | N1.getOperand(0)); |
28920 | return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); |
28921 | } |
28922 | |
28923 | static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, |
28924 | TargetLowering::DAGCombinerInfo &DCI, |
28925 | const X86Subtarget &Subtarget) { |
28926 | SDLoc dl(N); |
28927 | EVT VT = N->getValueType(0); |
28928 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
28929 | // If we have legalized the vector types, look for blends of FADD and FSUB |
28930 | // nodes that we can fuse into an ADDSUB node. |
28931 | if (TLI.isTypeLegal(VT)) |
28932 | if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) |
28933 | return AddSub; |
28934 | |
28935 | // During Type Legalization, when promoting illegal vector types, |
28936 | // the backend might introduce new shuffle dag nodes and bitcasts. |
28937 | // |
28938 | // This code performs the following transformation: |
28939 | // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> |
28940 | // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) |
28941 | // |
28942 | // We do this only if both the bitcast and the BINOP dag nodes have |
28943 | // one use. Also, perform this transformation only if the new binary |
28944 | // operation is legal. This is to avoid introducing dag nodes that |
28945 | // potentially need to be further expanded (or custom lowered) into a |
28946 | // less optimal sequence of dag nodes. |
28947 | if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && |
28948 | N->getOpcode() == ISD::VECTOR_SHUFFLE && |
28949 | N->getOperand(0).getOpcode() == ISD::BITCAST && |
28950 | N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { |
28951 | SDValue N0 = N->getOperand(0); |
28952 | SDValue N1 = N->getOperand(1); |
28953 | |
28954 | SDValue BC0 = N0.getOperand(0); |
28955 | EVT SVT = BC0.getValueType(); |
28956 | unsigned Opcode = BC0.getOpcode(); |
28957 | unsigned NumElts = VT.getVectorNumElements(); |
28958 | |
28959 | if (BC0.hasOneUse() && SVT.isVector() && |
28960 | SVT.getVectorNumElements() * 2 == NumElts && |
28961 | TLI.isOperationLegal(Opcode, VT)) { |
28962 | bool CanFold = false; |
28963 | switch (Opcode) { |
28964 | default: break;
28965 | case ISD::ADD: |
28966 | case ISD::SUB: |
28967 | case ISD::MUL: |
28968 | // isOperationLegal lies for integer ops on floating point types. |
28969 | CanFold = VT.isInteger(); |
28970 | break; |
28971 | case ISD::FADD: |
28972 | case ISD::FSUB: |
28973 | case ISD::FMUL: |
28974 | // isOperationLegal lies for floating point ops on integer types. |
28975 | CanFold = VT.isFloatingPoint(); |
28976 | break; |
28977 | } |
28978 | |
28979 | unsigned SVTNumElts = SVT.getVectorNumElements(); |
28980 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); |
28981 | for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) |
28982 | CanFold = SVOp->getMaskElt(i) == (int)(i * 2); |
28983 | for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) |
28984 | CanFold = SVOp->getMaskElt(i) < 0; |
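      | // E.g. for VT = v8i16 over an SVT = v4i32 BINOP, the only accepted mask is
      | // <0,2,4,6,u,u,u,u>, the even-lane extraction that type promotion itself
      | // introduces.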
28985 | |
28986 | if (CanFold) { |
28987 | SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); |
28988 | SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); |
28989 | SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); |
28990 | return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); |
28991 | } |
28992 | } |
28993 | } |
28994 | |
28995 | // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, |
28996 | // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are |
28997 | // consecutive, non-overlapping, and in the right order. |
28998 | SmallVector<SDValue, 16> Elts; |
28999 | for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { |
29000 | if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { |
29001 | Elts.push_back(Elt); |
29002 | continue; |
29003 | } |
29004 | Elts.clear(); |
29005 | break; |
29006 | } |
29007 | |
29008 | if (Elts.size() == VT.getVectorNumElements()) |
29009 | if (SDValue LD = |
29010 | EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) |
29011 | return LD; |
29012 | |
29013 | // For AVX2, we sometimes want to combine |
29014 | // (vector_shuffle <mask> (concat_vectors t1, undef) |
29015 | // (concat_vectors t2, undef)) |
29016 | // Into: |
29017 | // (vector_shuffle <mask> (concat_vectors t1, t2), undef) |
29018 | // Since the latter can be efficiently lowered with VPERMD/VPERMQ.
29019 | if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) |
29020 | return ShufConcat; |
29021 | |
29022 | if (isTargetShuffle(N->getOpcode())) { |
29023 | SDValue Op(N, 0); |
29024 | if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) |
29025 | return Shuffle; |
29026 | |
29027 | // Try recursively combining arbitrary sequences of x86 shuffle |
29028 | // instructions into higher-order shuffles. We do this after combining |
29029 | // specific PSHUF instruction sequences into their minimal form so that we |
29030 | // can evaluate how many specialized shuffle instructions are involved in |
29031 | // a particular chain. |
29032 | SmallVector<int, 1> NonceMask; // Just a placeholder. |
29033 | NonceMask.push_back(0); |
29034 | if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, |
29035 | /*Depth*/ 1, /*HasVarMask*/ false, DAG, |
29036 | DCI, Subtarget)) |
29037 | return SDValue(); // This routine will use CombineTo to replace N. |
29038 | } |
29039 | |
29040 | return SDValue(); |
29041 | } |
29042 | |
29043 | /// Check if a vector extract from a target-specific shuffle of a load can be |
29044 | /// folded into a single element load. |
29045 | /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but |
29046 | /// shuffles have been custom lowered so we need to handle those here. |
29047 | static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, |
29048 | TargetLowering::DAGCombinerInfo &DCI) { |
29049 | if (DCI.isBeforeLegalizeOps()) |
29050 | return SDValue(); |
29051 | |
29052 | SDValue InVec = N->getOperand(0); |
29053 | SDValue EltNo = N->getOperand(1); |
29054 | EVT EltVT = N->getValueType(0); |
29055 | |
29056 | if (!isa<ConstantSDNode>(EltNo)) |
29057 | return SDValue(); |
29058 | |
29059 | EVT OriginalVT = InVec.getValueType(); |
29060 | |
29061 | // Peek through bitcasts, don't duplicate a load with other uses. |
29062 | InVec = peekThroughOneUseBitcasts(InVec); |
29063 | |
29064 | EVT CurrentVT = InVec.getValueType(); |
29065 | if (!CurrentVT.isVector() || |
29066 | CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) |
29067 | return SDValue(); |
29068 | |
29069 | if (!isTargetShuffle(InVec.getOpcode())) |
29070 | return SDValue(); |
29071 | |
29072 | // Don't duplicate a load with other uses. |
29073 | if (!InVec.hasOneUse()) |
29074 | return SDValue(); |
29075 | |
29076 | SmallVector<int, 16> ShuffleMask; |
29077 | SmallVector<SDValue, 2> ShuffleOps; |
29078 | bool UnaryShuffle; |
29079 | if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, |
29080 | ShuffleOps, ShuffleMask, UnaryShuffle)) |
29081 | return SDValue(); |
29082 | |
29083 | // Select the input vector, guarding against out of range extract vector. |
29084 | unsigned NumElems = CurrentVT.getVectorNumElements(); |
29085 | int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); |
29086 | int Idx = (Elt >= (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29087 | |
29088 | if (Idx == SM_SentinelZero) |
29089 | return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) |
29090 | : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); |
29091 | if (Idx == SM_SentinelUndef) |
29092 | return DAG.getUNDEF(EltVT); |
29093 | |
29094 | assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29095 | SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] |
29096 | : ShuffleOps[1]; |
29097 | |
29098 | // If the inputs to the shuffle are the same for both ops, then allow 2 uses.
29099 | unsigned AllowedUses = |
29100 | (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; |
29101 | |
29102 | if (LdNode.getOpcode() == ISD::BITCAST) { |
29103 | // Don't duplicate a load with other uses. |
29104 | if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) |
29105 | return SDValue(); |
29106 | |
29107 | AllowedUses = 1; // only allow 1 load use if we have a bitcast |
29108 | LdNode = LdNode.getOperand(0); |
29109 | } |
29110 | |
29111 | if (!ISD::isNormalLoad(LdNode.getNode())) |
29112 | return SDValue(); |
29113 | |
29114 | LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); |
29115 | |
29116 | if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29117 | return SDValue(); |
29118 | |
29119 | // If there's a bitcast before the shuffle, check if the load type and |
29120 | // alignment are valid.
29121 | unsigned Align = LN0->getAlignment(); |
29122 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
29123 | unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( |
29124 | EltVT.getTypeForEVT(*DAG.getContext())); |
29125 | |
29126 | if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) |
29127 | return SDValue(); |
29128 | |
29129 | // All checks match so transform back to vector_shuffle so that DAG combiner |
29130 | // can finish the job.
29131 | SDLoc dl(N); |
29132 | |
29133 | // Create shuffle node taking into account the case that it's a unary shuffle
29134 | SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; |
29135 | Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, |
29136 | ShuffleMask); |
29137 | Shuffle = DAG.getBitcast(OriginalVT, Shuffle); |
29138 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, |
29139 | EltNo); |
29140 | } |
29141 | |
29142 | // Try to match patterns such as |
29143 | // (i16 bitcast (v16i1 x)) |
29144 | // -> |
29145 | // (i16 movmsk (16i8 sext (v16i1 x))) |
29146 | // before the illegal vector is scalarized on subtargets that don't have legal |
29147 | // vxi1 types. |
29148 | static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, |
29149 | const X86Subtarget &Subtarget) { |
29150 | EVT VT = BitCast.getValueType(); |
29151 | SDValue N0 = BitCast.getOperand(0); |
29152 | EVT VecVT = N0->getValueType(0); |
29153 | |
29154 | if (!VT.isScalarInteger() || !VecVT.isSimple()) |
29155 | return SDValue(); |
29156 | |
29157 | // With AVX512 vxi1 types are legal and we prefer using k-regs. |
29158 | // MOVMSK is supported in SSE2 or later. |
29159 | if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) |
29160 | return SDValue(); |
29161 | |
29162 | // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
29163 | // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
29164 | // v8i16 and v16i16. |
29165 | // For these two cases, we can shuffle the upper element bytes to a |
29166 | // consecutive sequence at the start of the vector and treat the results as |
29167 | // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
29168 | // for v16i16 this is not the case, because the shuffle is expensive, so we |
29169 | // avoid sign-extending to this type entirely.
29170 | // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: |
29171 | // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) |
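      | // Likewise, (i4 (bitcast (v4i1 x))) becomes
      | // (i4 (trunc (i32 (MOVMSK (v4f32 (bitcast (v4i32 (sext x)))))))), where
      | // MOVMSKPS gathers the four sign bits into the low bits of a GPR.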
29172 | MVT SExtVT; |
29173 | MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; |
29174 | switch (VecVT.getSimpleVT().SimpleTy) { |
29175 | default: |
29176 | return SDValue(); |
29177 | case MVT::v2i1: |
29178 | SExtVT = MVT::v2i64; |
29179 | FPCastVT = MVT::v2f64; |
29180 | break; |
29181 | case MVT::v4i1: |
29182 | SExtVT = MVT::v4i32; |
29183 | FPCastVT = MVT::v4f32; |
29184 | // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) |
29185 | // sign-extend to a 256-bit operation to avoid truncation. |
29186 | if (N0->getOpcode() == ISD::SETCC && |
29187 | N0->getOperand(0)->getValueType(0).is256BitVector() && |
29188 | Subtarget.hasInt256()) { |
29189 | SExtVT = MVT::v4i64; |
29190 | FPCastVT = MVT::v4f64; |
29191 | } |
29192 | break; |
29193 | case MVT::v8i1: |
29194 | SExtVT = MVT::v8i16; |
29195 | // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), |
29196 | // sign-extend to a 256-bit operation to match the compare. |
29197 | // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over |
29198 | // 256-bit because the shuffle is cheaper than sign extending the result of |
29199 | // the compare. |
29200 | if (N0->getOpcode() == ISD::SETCC && |
29201 | N0->getOperand(0)->getValueType(0).is256BitVector() && |
29202 | Subtarget.hasInt256()) { |
29203 | SExtVT = MVT::v8i32; |
29204 | FPCastVT = MVT::v8f32; |
29205 | } |
29206 | break; |
29207 | case MVT::v16i1: |
29208 | SExtVT = MVT::v16i8; |
29209 | // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), |
29210 | // it is not profitable to sign-extend to 256-bit because this will |
29211 | // require an extra cross-lane shuffle which is more expensive than
29212 | // truncating the result of the compare to 128-bits. |
29213 | break; |
29214 | case MVT::v32i1: |
29215 | // TODO: Handle pre-AVX2 cases by splitting to two v16i1's. |
29216 | if (!Subtarget.hasInt256()) |
29217 | return SDValue(); |
29218 | SExtVT = MVT::v32i8; |
29219 | break; |
29220 | }
29221 | |
29222 | SDLoc DL(BitCast); |
29223 | SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); |
29224 | if (SExtVT == MVT::v8i16) { |
29225 | V = DAG.getBitcast(MVT::v16i8, V); |
29226 | V = DAG.getVectorShuffle( |
29227 | MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), |
29228 | {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); |
29229 | } else |
29230 | assert(SExtVT.getScalarType() != MVT::i16 &&
29231 | "Vectors of i16 must be shuffled");
29232 | if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) |
29233 | V = DAG.getBitcast(FPCastVT, V); |
29234 | V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); |
29235 | return DAG.getZExtOrTrunc(V, DL, VT); |
29236 | } |
29237 | |
29238 | static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, |
29239 | TargetLowering::DAGCombinerInfo &DCI, |
29240 | const X86Subtarget &Subtarget) { |
29241 | SDValue N0 = N->getOperand(0); |
29242 | EVT VT = N->getValueType(0); |
29243 | EVT SrcVT = N0.getValueType(); |
29244 | |
29245 | // Try to match patterns such as |
29246 | // (i16 bitcast (v16i1 x)) |
29247 | // -> |
29248 | // (i16 movmsk (16i8 sext (v16i1 x))) |
29249 | // before the setcc result is scalarized on subtargets that don't have legal |
29250 | // vxi1 types. |
29251 | if (DCI.isBeforeLegalize()) |
29252 | if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) |
29253 | return V; |
29254 | // Since MMX types are special and don't usually play with other vector types, |
29255 | // it's better to handle them early to be sure we emit efficient code by |
29256 | // avoiding store-load conversions. |
29257 | |
29258 | // Detect bitcasts between i32 to x86mmx low word. |
29259 | if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && |
29260 | SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { |
29261 | SDValue N00 = N0->getOperand(0); |
29262 | if (N00.getValueType() == MVT::i32) |
29263 | return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); |
29264 | } |
29265 | |
29266 | // Detect bitcasts between element or subvector extraction to x86mmx. |
29267 | if (VT == MVT::x86mmx && |
29268 | (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || |
29269 | N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && |
29270 | isNullConstant(N0.getOperand(1))) { |
29271 | SDValue N00 = N0->getOperand(0); |
29272 | if (N00.getValueType().is128BitVector()) |
29273 | return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, |
29274 | DAG.getBitcast(MVT::v2i64, N00)); |
29275 | } |
29276 | |
29277 | // Detect bitcasts from FP_TO_SINT to x86mmx. |
29278 | if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && |
29279 | N0.getOpcode() == ISD::FP_TO_SINT) { |
29280 | SDLoc DL(N0); |
29281 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, |
29282 | DAG.getUNDEF(MVT::v2i32)); |
29283 | return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, |
29284 | DAG.getBitcast(MVT::v2i64, Res)); |
29285 | } |
29286 | |
29287 | // Convert a bitcasted integer logic operation that has one bitcasted |
29288 | // floating-point operand into a floating-point logic operation. This may |
29289 | // create a load of a constant, but that is cheaper than materializing the |
29290 | // constant in an integer register and transferring it to an SSE register or |
29291 | // transferring the SSE operand to integer register and back. |
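      | // E.g. (f32 (bitcast (and (i32 (bitcast X)), 0x7fffffff))) becomes
      | // (FAND X, (f32 (bitcast 0x7fffffff))), keeping an fabs-style masking in
      | // the SSE domain instead of bouncing through a GPR.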
29292 | unsigned FPOpcode; |
29293 | switch (N0.getOpcode()) { |
29294 | case ISD::AND: FPOpcode = X86ISD::FAND; break; |
29295 | case ISD::OR: FPOpcode = X86ISD::FOR; break; |
29296 | case ISD::XOR: FPOpcode = X86ISD::FXOR; break; |
29297 | default: return SDValue(); |
29298 | } |
29299 | |
29300 | if (!((Subtarget.hasSSE1() && VT == MVT::f32) || |
29301 | (Subtarget.hasSSE2() && VT == MVT::f64))) |
29302 | return SDValue(); |
29303 | |
29304 | SDValue LogicOp0 = N0.getOperand(0); |
29305 | SDValue LogicOp1 = N0.getOperand(1); |
29306 | SDLoc DL0(N0); |
29307 | |
29308 | // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) |
29309 | if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && |
29310 | LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && |
29311 | !isa<ConstantSDNode>(LogicOp0.getOperand(0))) { |
29312 | SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); |
29313 | return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); |
29314 | } |
29315 | // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) |
29316 | if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && |
29317 | LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && |
29318 | !isa<ConstantSDNode>(LogicOp1.getOperand(0))) { |
29319 | SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); |
29320 | return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); |
29321 | } |
29322 | |
29323 | return SDValue(); |
29324 | } |
29325 | |
29326 | // Match a binop + shuffle pyramid that represents a horizontal reduction over |
29327 | // the elements of a vector. |
29328 | // Returns the vector that is being reduced on, or SDValue() if a reduction |
29329 | // was not matched. |
29330 | static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { |
29331 | // The pattern must end in an extract from index 0. |
29332 | if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) || |
29333 | !isNullConstant(Extract->getOperand(1))) |
29334 | return SDValue(); |
29335 | |
29336 | unsigned Stages = |
29337 | Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements()); |
29338 | |
29339 | SDValue Op = Extract->getOperand(0); |
29340 | // At each stage, we're looking for something that looks like: |
29341 | // %s = shufflevector <8 x i32> %op, <8 x i32> undef, |
29342 | // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, |
29343 | // i32 undef, i32 undef, i32 undef, i32 undef> |
29344 | // %a = binop <8 x i32> %op, %s |
29345 | // Where the mask changes according to the stage. E.g. for a 3-stage pyramid, |
29346 | // we expect something like: |
29347 | // <4,5,6,7,u,u,u,u> |
29348 | // <2,3,u,u,u,u,u,u> |
29349 | // <1,u,u,u,u,u,u,u> |
29350 | for (unsigned i = 0; i < Stages; ++i) { |
29351 | if (Op.getOpcode() != BinOp) |
29352 | return SDValue(); |
29353 | |
29354 | ShuffleVectorSDNode *Shuffle = |
29355 | dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode()); |
29356 | if (Shuffle) { |
29357 | Op = Op.getOperand(1); |
29358 | } else { |
29359 | Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode()); |
29360 | Op = Op.getOperand(0); |
29361 | } |
29362 | |
29363 | // The first operand of the shuffle should be the same as the other operand |
29364 | // of the add. |
29365 | if (!Shuffle || (Shuffle->getOperand(0) != Op)) |
29366 | return SDValue(); |
29367 | |
29368 | // Verify the shuffle has the expected (at this stage of the pyramid) mask. |
29369 | for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index) |
29370 | if (Shuffle->getMaskElt(Index) != MaskEnd + Index) |
29371 | return SDValue(); |
29372 | } |
29373 | |
29374 | return Op; |
29375 | } |
29376 | |
29377 | // Given a select, detect the following pattern: |
29378 | // 1: %2 = zext <N x i8> %0 to <N x i32> |
29379 | // 2: %3 = zext <N x i8> %1 to <N x i32> |
29380 | // 3: %4 = sub nsw <N x i32> %2, %3 |
29381 | // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] |
29382 | // 5: %6 = sub nsw <N x i32> zeroinitializer, %4 |
29383 | // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 |
29384 | // This is useful as it is the input into a SAD pattern. |
29385 | static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, |
29386 | SDValue &Op1) { |
29387 | // Check that the condition of the select instruction is greater-than.
29388 | SDValue SetCC = Select->getOperand(0); |
29389 | if (SetCC.getOpcode() != ISD::SETCC) |
29390 | return false; |
29391 | ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); |
29392 | if (CC != ISD::SETGT && CC != ISD::SETLT) |
29393 | return false; |
29394 | |
29395 | SDValue SelectOp1 = Select->getOperand(1); |
29396 | SDValue SelectOp2 = Select->getOperand(2); |
29397 | |
29398 | // The following instructions assume SelectOp1 is the subtraction operand |
29399 | // and SelectOp2 is the negation operand. |
29400 | // In the case of SETLT this is the other way around. |
29401 | if (CC == ISD::SETLT) |
29402 | std::swap(SelectOp1, SelectOp2); |
29403 | |
29404 | // The second operand of the select should be the negation of the first |
29405 | // operand, which is implemented as 0 - SelectOp1. |
29406 | if (!(SelectOp2.getOpcode() == ISD::SUB && |
29407 | ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && |
29408 | SelectOp2.getOperand(1) == SelectOp1)) |
29409 | return false; |
29410 | |
29411 | // The first operand of SetCC is the first operand of the select, which is the |
29412 | // difference between the two input vectors. |
29413 | if (SetCC.getOperand(0) != SelectOp1) |
29414 | return false; |
29415 | |
29416 | // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29417 | APInt SplatVal; |
29418 | if ((CC == ISD::SETLT) && |
29419 | !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && |
29420 | SplatVal == 1) || |
29421 | (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) |
29422 | return false; |
29423 | |
29424 | // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29425 | if ((CC == ISD::SETGT) && |
29426 | !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || |
29427 | ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) |
29428 | return false; |
29429 | |
29430 | // The first operand of the select is the difference between the two input |
29431 | // vectors. |
29432 | if (SelectOp1.getOpcode() != ISD::SUB) |
29433 | return false; |
29434 | |
29435 | Op0 = SelectOp1.getOperand(0); |
29436 | Op1 = SelectOp1.getOperand(1); |
29437 | |
29438 | // Check if the operands of the sub are zero-extended from vectors of i8. |
29439 | if (Op0.getOpcode() != ISD::ZERO_EXTEND || |
29440 | Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || |
29441 | Op1.getOpcode() != ISD::ZERO_EXTEND || |
29442 | Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) |
29443 | return false; |
29444 | |
29445 | return true; |
29446 | } |
29447 | |
29448 | // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs |
29449 | // to these zexts. |
29450 | static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, |
29451 | const SDValue &Zext1, const SDLoc &DL) { |
29452 | |
29453 | // Find the appropriate width for the PSADBW. |
29454 | EVT InVT = Zext0.getOperand(0).getValueType(); |
29455 | unsigned RegSize = std::max(128u, InVT.getSizeInBits()); |
29456 | |
29457 | // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we |
29458 | // fill in the missing vector elements with 0. |
29459 | unsigned NumConcat = RegSize / InVT.getSizeInBits(); |
29460 | SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); |
29461 | Ops[0] = Zext0.getOperand(0); |
29462 | MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); |
29463 | SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); |
29464 | Ops[0] = Zext1.getOperand(0); |
29465 | SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); |
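      | // E.g. for v4i8 inputs, RegSize is 128 and NumConcat is 128/32 = 4, so each
      | // operand becomes (v16i8 concat X, 0, 0, 0); the zero bytes contribute
      | // nothing to the byte-wise sums PSADBW produces.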
29466 | |
29467 | // Actually build the SAD |
29468 | MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); |
29469 | return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); |
29470 | } |
29471 | |
29472 | // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. |
29473 | static SDValue combineHorizontalPredicateResult(SDNode *Extract, |
29474 | SelectionDAG &DAG, |
29475 | const X86Subtarget &Subtarget) { |
29476 | // Bail without SSE2 or with AVX512VL (which uses predicate registers). |
29477 | if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) |
29478 | return SDValue(); |
29479 | |
29480 | EVT ExtractVT = Extract->getValueType(0); |
29481 | unsigned BitWidth = ExtractVT.getSizeInBits(); |
29482 | if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && |
29483 | ExtractVT != MVT::i8) |
29484 | return SDValue(); |
29485 | |
29486 | // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. |
29487 | for (ISD::NodeType Op : {ISD::OR, ISD::AND}) { |
29488 | SDValue Match = matchBinOpReduction(Extract, Op); |
29489 | if (!Match) |
29490 | continue; |
29491 | |
29492 | // EXTRACT_VECTOR_ELT can require implicit extension of the vector element |
29493 | // which we can't support here for now. |
29494 | if (Match.getScalarValueSizeInBits() != BitWidth) |
29495 | continue; |
29496 | |
29497 | // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29498 | unsigned MatchSizeInBits = Match.getValueSizeInBits(); |
29499 | if (!(MatchSizeInBits == 128 || |
29500 | (MatchSizeInBits == 256 && |
29501 | ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) |
29502 | return SDValue(); |
29503 | |
29504 | // Don't bother performing this for 2-element vectors. |
29505 | if (Match.getValueType().getVectorNumElements() <= 2) |
29506 | return SDValue(); |
29507 | |
29508 | // Check that we are extracting a reduction of all sign bits. |
29509 | if (DAG.ComputeNumSignBits(Match) != BitWidth) |
29510 | return SDValue(); |
29511 | |
29512 | // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. |
29513 | MVT MaskVT; |
29514 | if (64 == BitWidth || 32 == BitWidth) |
29515 | MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), |
29516 | MatchSizeInBits / BitWidth); |
29517 | else |
29518 | MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); |
29519 | |
29520 | APInt CompareBits; |
29521 | ISD::CondCode CondCode; |
29522 | if (Op == ISD::OR) { |
29523 | // any_of -> MOVMSK != 0 |
29524 | CompareBits = APInt::getNullValue(32); |
29525 | CondCode = ISD::CondCode::SETNE; |
29526 | } else { |
29527 | // all_of -> MOVMSK == ((1 << NumElts) - 1) |
29528 | CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); |
29529 | CondCode = ISD::CondCode::SETEQ; |
29530 | } |
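      | // E.g. for a v4i32 all_of reduction, MaskVT is v4f32 and MOVMSKPS yields
      | // four sign bits, so all_of checks MOVMSK == 0xF while any_of checks
      | // MOVMSK != 0.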
29531 | |
29532 | // Perform the select as i32/i64 and then truncate to avoid partial register |
29533 | // stalls. |
29534 | unsigned ResWidth = std::max(BitWidth, 32u); |
29535 | EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); |
29536 | SDLoc DL(Extract); |
29537 | SDValue Zero = DAG.getConstant(0, DL, ResVT); |
29538 | SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); |
29539 | SDValue Res = DAG.getBitcast(MaskVT, Match); |
29540 | Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); |
29541 | Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), |
29542 | Ones, Zero, CondCode); |
29543 | return DAG.getSExtOrTrunc(Res, DL, ExtractVT); |
29544 | } |
29545 | |
29546 | return SDValue(); |
29547 | } |
29548 | |
29549 | static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, |
29550 | const X86Subtarget &Subtarget) { |
29551 | // PSADBW is only supported on SSE2 and up. |
29552 | if (!Subtarget.hasSSE2()) |
29553 | return SDValue(); |
29554 | |
29555 | // Verify that the type we're extracting from is an integer type wider than i16.
29556 | EVT VT = Extract->getOperand(0).getValueType(); |
29557 | if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) |
29558 | return SDValue(); |
29559 | |
29560 | unsigned RegSize = 128; |
29561 | if (Subtarget.hasBWI()) |
29562 | RegSize = 512; |
29563 | else if (Subtarget.hasAVX2()) |
29564 | RegSize = 256; |
29565 | |
29566 | // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29567 | // TODO: We should be able to handle larger vectors by splitting them before |
29568 | // feeding them into several SADs, and then reducing over those. |
29569 | if (RegSize / VT.getVectorNumElements() < 8) |
29570 | return SDValue(); |
29571 | |
29572 | // Match shuffle + add pyramid. |
29573 | SDValue Root = matchBinOpReduction(Extract, ISD::ADD); |
29574 | |
29575 | // The operand is expected to be zero extended from i8 |
29576 | // (verified in detectZextAbsDiff). |
29577 | // In order to convert to i64 and above, additional any/zero/sign |
29578 | // extend is expected. |
29579 | // The zero extend from 32 bits has no mathematical effect on the result.
29580 | // Also, the sign extend is effectively a zero extend
29581 | // (it extends the sign bit, which is zero).
29582 | // So it is correct to skip the sign/zero extend instruction. |
29583 | if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || |
29584 | Root.getOpcode() == ISD::ZERO_EXTEND || |
29585 | Root.getOpcode() == ISD::ANY_EXTEND)) |
29586 | Root = Root.getOperand(0); |
29587 | |
29588 | // If there was a match, we want Root to be a select that is the root of an |
29589 | // abs-diff pattern. |
29590 | if (!Root || (Root.getOpcode() != ISD::VSELECT)) |
29591 | return SDValue(); |
29592 | |
29593 | // Check whether we have an abs-diff pattern feeding into the select. |
29594 | SDValue Zext0, Zext1; |
29595 | if (!detectZextAbsDiff(Root, Zext0, Zext1)) |
29596 | return SDValue(); |
29597 | |
29598 | // Create the SAD instruction. |
29599 | SDLoc DL(Extract); |
29600 | SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); |
29601 | |
29602 | // If the original vector was wider than 8 elements, sum over the results |
29603 | // in the SAD vector. |
29604 | unsigned Stages = Log2_32(VT.getVectorNumElements()); |
29605 | MVT SadVT = SAD.getSimpleValueType(); |
29606 | if (Stages > 3) { |
29607 | unsigned SadElems = SadVT.getVectorNumElements(); |
29608 | |
29609 | for (unsigned i = Stages - 3; i > 0; --i) {
29610 | SmallVector<int, 16> Mask(SadElems, -1);
29611 | for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29612 | Mask[j] = MaskEnd + j; |
29613 | |
29614 | SDValue Shuffle = |
29615 | DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); |
29616 | SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); |
29617 | } |
29618 | } |
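      | // E.g. reducing v16i32 on SSE2 gives SadVT = v2i64 holding two partial
      | // sums; Stages is 4, so a single shuffle <1,u> plus ADD folds the upper
      | // partial sum into element 0 before the final extract.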
29619 | |
29620 | MVT Type = Extract->getSimpleValueType(0); |
29621 | unsigned TypeSizeInBits = Type.getSizeInBits(); |
29622 | // Return the lowest TypeSizeInBits bits. |
29623 | MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); |
29624 | SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); |
29625 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, |
29626 | Extract->getOperand(1)); |
29627 | } |
29628 | |
29629 | // Attempt to peek through a target shuffle and extract the scalar from the |
29630 | // source. |
29631 | static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, |
29632 | TargetLowering::DAGCombinerInfo &DCI, |
29633 | const X86Subtarget &Subtarget) { |
29634 | if (DCI.isBeforeLegalizeOps()) |
29635 | return SDValue(); |
29636 | |
29637 | SDValue Src = N->getOperand(0); |
29638 | SDValue Idx = N->getOperand(1); |
29639 | |
29640 | EVT VT = N->getValueType(0); |
29641 | EVT SrcVT = Src.getValueType(); |
29642 | EVT SrcSVT = SrcVT.getVectorElementType(); |
29643 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); |
29644 | |
29645 | // Don't attempt this for boolean mask vectors or unknown extraction indices. |
29646 | if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) |
29647 | return SDValue(); |
29648 | |
29649 | // Resolve the target shuffle inputs and mask. |
29650 | SmallVector<int, 16> Mask; |
29651 | SmallVector<SDValue, 2> Ops; |
29652 | if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) |
29653 | return SDValue(); |
29654 | |
29655 | // Attempt to narrow/widen the shuffle mask to the correct size. |
29656 | if (Mask.size() != NumSrcElts) { |
29657 | if ((NumSrcElts % Mask.size()) == 0) { |
29658 | SmallVector<int, 16> ScaledMask; |
29659 | int Scale = NumSrcElts / Mask.size(); |
29660 | scaleShuffleMask(Scale, Mask, ScaledMask); |
29661 | Mask = std::move(ScaledMask); |
29662 | } else if ((Mask.size() % NumSrcElts) == 0) { |
29663 | SmallVector<int, 16> WidenedMask; |
29664 | while (Mask.size() > NumSrcElts && |
29665 | canWidenShuffleElements(Mask, WidenedMask)) |
29666 | Mask = std::move(WidenedMask); |
29667 | // TODO - investigate support for wider shuffle masks with known upper |
29668 | // undef/zero elements for implicit zero-extension. |
29669 | } |
29670 | } |
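      | // E.g. extracting from a v8i16 source whose shuffle was resolved at v4i32
      | // granularity (Mask.size() == 4) gives Scale = 2, so a mask <0,2,1,3>
      | // rescales to <0,1,4,5,2,3,6,7>.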
29671 | |
29672 | // Check if narrowing/widening failed. |
29673 | if (Mask.size() != NumSrcElts) |
29674 | return SDValue(); |
29675 | |
29676 | int SrcIdx = Mask[N->getConstantOperandVal(1)]; |
29677 | SDLoc dl(N); |
29678 | |
29679 | // If the shuffle source element is undef/zero then we can just accept it. |
29680 | if (SrcIdx == SM_SentinelUndef) |
29681 | return DAG.getUNDEF(VT); |
29682 | |
29683 | if (SrcIdx == SM_SentinelZero) |
29684 | return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) |
29685 | : DAG.getConstant(0, dl, VT); |
29686 | |
29687 | SDValue SrcOp = Ops[SrcIdx / Mask.size()]; |
29688 | SrcOp = DAG.getBitcast(SrcVT, SrcOp); |
29689 | SrcIdx = SrcIdx % Mask.size(); |
29690 | |
29691 | // We can only extract other elements from 128-bit vectors and in certain |
29692 | // circumstances, depending on SSE-level. |
29693 | // TODO: Investigate using extract_subvector for larger vectors. |
29694 | // TODO: Investigate float/double extraction if it will be just stored. |
29695 | if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && |
29696 | ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { |
29697 | assert(SrcSVT == VT && "Unexpected extraction type");
29698 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, |
29699 | DAG.getIntPtrConstant(SrcIdx, dl)); |
29700 | } |
29701 | |
29702 | if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || |
29703 | (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { |
29704 | assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29705 | "Unexpected extraction type");
29706 | unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); |
29707 | SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, |
29708 | DAG.getIntPtrConstant(SrcIdx, dl)); |
29709 | SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp, |
29710 | DAG.getValueType(SrcSVT)); |
29711 | return DAG.getZExtOrTrunc(Assert, dl, VT); |
29712 | } |
29713 | |
29714 | return SDValue(); |
29715 | } |
29716 | |
29717 | /// Detect vector gather/scatter index generation and convert it from being a |
29718 | /// bunch of shuffles and extracts into a somewhat faster sequence. |
29719 | /// For i686, the best sequence is apparently storing the value and loading |
29720 | /// scalars back, while for x64 we should use 64-bit extracts and shifts. |
29721 | static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, |
29722 | TargetLowering::DAGCombinerInfo &DCI, |
29723 | const X86Subtarget &Subtarget) { |
29724 | if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) |
29725 | return NewOp; |
29726 | |
29727 | if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) |
29728 | return NewOp; |
29729 | |
29730 | SDValue InputVector = N->getOperand(0); |
29731 | SDValue EltIdx = N->getOperand(1); |
29732 | |
29733 | EVT SrcVT = InputVector.getValueType(); |
29734 | EVT VT = N->getValueType(0); |
29735 | SDLoc dl(InputVector); |
29736 | |
29737 | // Detect mmx extraction of all bits as a i64. It works better as a bitcast. |
29738 | if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && |
29739 | VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { |
29740 | SDValue MMXSrc = InputVector.getOperand(0); |
29741 | |
29742 | // The bitcast source is a direct mmx result. |
29743 | if (MMXSrc.getValueType() == MVT::x86mmx) |
29744 | return DAG.getBitcast(VT, InputVector); |
29745 | } |
29746 | |
29747 | // Detect mmx to i32 conversion through a v2i32 elt extract. |
29748 | if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && |
29749 | VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { |
29750 | SDValue MMXSrc = InputVector.getOperand(0); |
29751 | |
29752 | // The bitcast source is a direct mmx result. |
29753 | if (MMXSrc.getValueType() == MVT::x86mmx) |
29754 | return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); |
29755 | } |
29756 | |
29757 | if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && |
29758 | isa<ConstantSDNode>(EltIdx) && |
29759 | isa<ConstantSDNode>(InputVector.getOperand(0))) { |
29760 | uint64_t ExtractedElt = N->getConstantOperandVal(1); |
29761 | uint64_t InputValue = InputVector.getConstantOperandVal(0); |
29762 | uint64_t Res = (InputValue >> ExtractedElt) & 1; |
29763 | return DAG.getConstant(Res, dl, MVT::i1); |
29764 | } |
29765 | |
29766 | // Check whether this extract is the root of a sum of absolute differences |
29767 | // pattern. This has to be done here because we really want it to happen |
29768 | // pre-legalization.
29769 | if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) |
29770 | return SAD; |
29771 | |
29772 | // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. |
29773 | if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) |
29774 | return Cmp; |
29775 | |
29776 | // Only operate on vectors of 4 elements, where the alternative shuffling |
29777 | // gets to be more expensive. |
29778 | if (SrcVT != MVT::v4i32) |
29779 | return SDValue(); |
29780 | |
29781 | // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a |
29782 | // single use which is a sign-extend or zero-extend, and all elements are |
29783 | // used. |
29784 | SmallVector<SDNode *, 4> Uses; |
29785 | unsigned ExtractedElements = 0; |
29786 | for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), |
29787 | UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { |
29788 | if (UI.getUse().getResNo() != InputVector.getResNo()) |
29789 | return SDValue(); |
29790 | |
29791 | SDNode *Extract = *UI; |
29792 | if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
29793 | return SDValue(); |
29794 | |
29795 | if (Extract->getValueType(0) != MVT::i32) |
29796 | return SDValue(); |
29797 | if (!Extract->hasOneUse()) |
29798 | return SDValue(); |
29799 | if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && |
29800 | Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) |
29801 | return SDValue(); |
29802 | if (!isa<ConstantSDNode>(Extract->getOperand(1))) |
29803 | return SDValue(); |
29804 | |
29805 | // Record which element was extracted. |
29806 | ExtractedElements |= 1 << Extract->getConstantOperandVal(1); |
29807 | Uses.push_back(Extract); |
29808 | } |
29809 | |
29810 | // If not all the elements were used, this may not be worthwhile. |
29811 | if (ExtractedElements != 15) |
29812 | return SDValue(); |
29813 | |
29814 | // Ok, we've now decided to do the transformation. |
29815 | // If 64-bit shifts are legal, use the extract-shift sequence, |
29816 | // otherwise bounce the vector off the cache. |
29817 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
29818 | SDValue Vals[4]; |
29819 | |
29820 | if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { |
29821 | SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); |
29822 | auto &DL = DAG.getDataLayout(); |
29823 | EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); |
29824 | SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, |
29825 | DAG.getConstant(0, dl, VecIdxTy)); |
29826 | SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, |
29827 | DAG.getConstant(1, dl, VecIdxTy)); |
29828 | |
29829 | SDValue ShAmt = DAG.getConstant( |
29830 | 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); |
29831 | Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); |
29832 | Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, |
29833 | DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); |
29834 | Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); |
29835 | Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, |
29836 | DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); |
29837 | } else { |
29838 | // Store the value to a temporary stack slot. |
29839 | SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); |
29840 | SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, |
29841 | MachinePointerInfo()); |
29842 | |
29843 | EVT ElementType = SrcVT.getVectorElementType(); |
29844 | unsigned EltSize = ElementType.getSizeInBits() / 8; |
29845 | |
29846 | // Replace each use (extract) with a load of the appropriate element. |
29847 | for (unsigned i = 0; i < 4; ++i) { |
29848 | uint64_t Offset = EltSize * i; |
29849 | auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); |
29850 | SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); |
29851 | |
29852 | SDValue ScalarAddr = |
29853 | DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); |
29854 | |
29855 | // Load the scalar. |
29856 | Vals[i] = |
29857 | DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo()); |
29858 | } |
29859 | } |
29860 | |
29861 | // Replace the extracts |
29862 | for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), |
29863 | UE = Uses.end(); UI != UE; ++UI) { |
29864 | SDNode *Extract = *UI; |
29865 | |
29866 | uint64_t IdxVal = Extract->getConstantOperandVal(1); |
29867 | DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); |
29868 | } |
29869 | |
29870 | // The replacement was made in place; don't return anything. |
29871 | return SDValue(); |
29872 | } |
29873 | |
29874 | // TODO - merge with combineExtractVectorElt once it can handle the implicit |
29875 | // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in: |
29876 | // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and |
29877 | // combineBasicSADPattern. |
29878 | static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG, |
29879 | TargetLowering::DAGCombinerInfo &DCI, |
29880 | const X86Subtarget &Subtarget) { |
29881 | return combineExtractWithShuffle(N, DAG, DCI, Subtarget); |
29882 | } |
29883 | |
29884 | /// If a vector select has an operand that is -1 or 0, try to simplify the |
29885 | /// select to a bitwise logic operation. |
29886 | static SDValue |
29887 | combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, |
29888 | TargetLowering::DAGCombinerInfo &DCI, |
29889 | const X86Subtarget &Subtarget) { |
29890 | SDValue Cond = N->getOperand(0); |
29891 | SDValue LHS = N->getOperand(1); |
29892 | SDValue RHS = N->getOperand(2); |
29893 | EVT VT = LHS.getValueType(); |
29894 | EVT CondVT = Cond.getValueType(); |
29895 | SDLoc DL(N); |
29896 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
29897 | |
29898 | if (N->getOpcode() != ISD::VSELECT) |
29899 | return SDValue(); |
29900 | |
29901 | assert(CondVT.isVector() && "Vector select expects a vector selector!");
29902 | |
29903 | bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); |
29904 | // Check if the first operand is all zeros and Cond type is vXi1. |
29905 | // This situation only applies to AVX512.
29906 | if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && |
29907 | CondVT.getVectorElementType() == MVT::i1) { |
29908 | // Invert the cond to not(cond) : xor(op,allones)=not(op) |
29909 | SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, |
29910 | DAG.getAllOnesConstant(DL, CondVT)); |
29911 | // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 |
29912 | return DAG.getSelect(DL, VT, CondNew, RHS, LHS); |
29913 | } |
29914 | |
29915 | // To use the condition operand as a bitwise mask, it must have elements that |
29916 | // are the same size as the select elements. I.e., the condition operand must
29917 | // have already been promoted from the IR select condition type <N x i1>. |
29918 | // Don't check if the types themselves are equal because that excludes |
29919 | // vector floating-point selects. |
29920 | if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) |
29921 | return SDValue(); |
29922 | |
29923 | bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); |
29924 | FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); |
29925 | |
29926 | // Try to invert the condition if true value is not all 1s and false value is |
29927 | // not all 0s. |
29928 | if (!TValIsAllOnes && !FValIsAllZeros && |
29929 | // Check if the selector will be produced by CMPP*/PCMP*. |
29930 | Cond.getOpcode() == ISD::SETCC && |
29931 | // Check if SETCC has already been promoted. |
29932 | TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == |
29933 | CondVT) { |
29934 | bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); |
29935 | bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); |
29936 | |
29937 | if (TValIsAllZeros || FValIsAllOnes) { |
29938 | SDValue CC = Cond.getOperand(2); |
29939 | ISD::CondCode NewCC = |
29940 | ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), |
29941 | Cond.getOperand(0).getValueType().isInteger()); |
29942 | Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), |
29943 | NewCC); |
29944 | std::swap(LHS, RHS); |
29945 | TValIsAllOnes = FValIsAllOnes; |
29946 | FValIsAllZeros = TValIsAllZeros; |
29947 | } |
29948 | } |
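      | // E.g. (vselect (setgt X, Y), 0, -1) is rewritten here as
      | // (vselect (setle X, Y), -1, 0), which the checks below collapse to the
      | // setcc mask itself.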
29949 | |
29950 | // vselect Cond, 111..., 000... -> Cond |
29951 | if (TValIsAllOnes && FValIsAllZeros) |
29952 | return DAG.getBitcast(VT, Cond); |
29953 | |
29954 | if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) |
29955 | return SDValue(); |
29956 | |
29957 | // vselect Cond, 111..., X -> or Cond, X |
29958 | if (TValIsAllOnes) { |
29959 | SDValue CastRHS = DAG.getBitcast(CondVT, RHS); |
29960 | SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); |
29961 | return DAG.getBitcast(VT, Or); |
29962 | } |
29963 | |
29964 | // vselect Cond, X, 000... -> and Cond, X |
29965 | if (FValIsAllZeros) { |
29966 | SDValue CastLHS = DAG.getBitcast(CondVT, LHS); |
29967 | SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); |
29968 | return DAG.getBitcast(VT, And); |
29969 | } |
29970 | |
29971 | return SDValue(); |
29972 | } |
29973 | |
29974 | static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { |
29975 | SDValue Cond = N->getOperand(0); |
29976 | SDValue LHS = N->getOperand(1); |
29977 | SDValue RHS = N->getOperand(2); |
29978 | SDLoc DL(N); |
29979 | |
29980 | auto *TrueC = dyn_cast<ConstantSDNode>(LHS); |
29981 | auto *FalseC = dyn_cast<ConstantSDNode>(RHS); |
29982 | if (!TrueC || !FalseC) |
29983 | return SDValue(); |
29984 | |
29985 | // Don't do this for crazy integer types. |
29986 | if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) |
29987 | return SDValue(); |
29988 | |
29989 | // If this is efficiently invertible, canonicalize the LHSC/RHSC values |
29990 | // so that TrueC (the true value) is larger than FalseC. |
29991 | bool NeedsCondInvert = false; |
29992 | if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && |
29993 | // Efficiently invertible. |
29994 | (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. |
29995 | (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. |
29996 | isa<ConstantSDNode>(Cond.getOperand(1))))) { |
29997 | NeedsCondInvert = true; |
29998 | std::swap(TrueC, FalseC); |
29999 | } |
30000 | |
30001 | // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. |
30002 | if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { |
30003 | if (NeedsCondInvert) // Invert the condition if needed. |
30004 | Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, |
30005 | DAG.getConstant(1, DL, Cond.getValueType())); |
30006 | |
30007 | // Zero extend the condition if needed. |
30008 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); |
30009 | |
30010 | unsigned ShAmt = TrueC->getAPIntValue().logBase2(); |
30011 | return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, |
30012 | DAG.getConstant(ShAmt, DL, MVT::i8)); |
30013 | } |
30014 | |
30015 | // Optimize cases that will turn into an LEA instruction. This requires |
30016 | // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). |
30017 | if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { |
30018 | uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue(); |
30019 | if (N->getValueType(0) == MVT::i32) |
30020 | Diff = (unsigned)Diff; |
30021 | |
30022 | bool IsFastMultiplier = false; |
30023 | if (Diff < 10) { |
30024 | switch ((unsigned char)Diff) { |
30025 | default: |
30026 | break; |
30027 | case 1: // result = add base, cond |
30028 | case 2: // result = lea base( , cond*2) |
30029 | case 3: // result = lea base(cond, cond*2) |
30030 | case 4: // result = lea base( , cond*4) |
30031 | case 5: // result = lea base(cond, cond*4) |
30032 | case 8: // result = lea base( , cond*8) |
30033 | case 9: // result = lea base(cond, cond*8) |
30034 | IsFastMultiplier = true; |
30035 | break; |
30036 | } |
30037 | } |
30038 | |
30039 | if (IsFastMultiplier) { |
30040 | APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); |
30041 | if (NeedsCondInvert) // Invert the condition if needed. |
30042 | Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, |
30043 | DAG.getConstant(1, DL, Cond.getValueType())); |
30044 | |
30045 | // Zero extend the condition if needed. |
30046 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); |
30047 | // Scale the condition by the difference. |
30048 | if (Diff != 1) |
30049 | Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, |
30050 | DAG.getConstant(Diff, DL, Cond.getValueType())); |
30051 | |
30052 | // Add the base if non-zero. |
30053 | if (FalseC->getAPIntValue() != 0) |
30054 | Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, |
30055 | SDValue(FalseC, 0)); |
30056 | return Cond; |
30057 | } |
30058 | } |
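      | // E.g. (select Cond, i32 13, i32 4) has Diff = 9, so it becomes
      | // (add (mul (zext Cond), 9), 4) and fits a single lea base(cond, cond*8).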
30059 | |
30060 | return SDValue(); |
30061 | } |
30062 | |
30063 | // If this is a bitcasted op that can be represented as another type, push
30064 | // the bitcast to the inputs. This allows more opportunities for pattern
30065 | // matching masked instructions. This is called when we know that the operation |
30066 | // is used as one of the inputs of a vselect. |
30067 | static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, |
30068 | TargetLowering::DAGCombinerInfo &DCI) { |
30069 | // Make sure we have a bitcast. |
30070 | if (OrigOp.getOpcode() != ISD::BITCAST) |
30071 | return false; |
30072 | |
30073 | SDValue Op = OrigOp.getOperand(0); |
30074 | |
30075 | // If the operation is used by anything other than the bitcast, we shouldn't |
30076 | // do this combine as that would replicate the operation. |
30077 | if (!Op.hasOneUse()) |
30078 | return false; |
30079 | |
30080 | MVT VT = OrigOp.getSimpleValueType(); |
30081 | MVT EltVT = VT.getVectorElementType(); |
30082 | SDLoc DL(Op.getNode()); |
30083 | |
30084 | auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1, |
30085 | SDValue Op2) { |
30086 | Op0 = DAG.getBitcast(VT, Op0); |
30087 | DCI.AddToWorklist(Op0.getNode()); |
30088 | Op1 = DAG.getBitcast(VT, Op1); |
30089 | DCI.AddToWorklist(Op1.getNode()); |
30090 | DCI.CombineTo(OrigOp.getNode(), |
30091 | DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2)); |
30092 | return true; |
30093 | }; |
30094 | |
30095 | unsigned Opcode = Op.getOpcode(); |
30096 | switch (Opcode) { |
30097 | case X86ISD::PALIGNR: |
30098 | // PALIGNR can be converted to VALIGND/Q for 128-bit vectors. |
30099 | if (!VT.is128BitVector()) |
30100 | return false; |
30101 | Opcode = X86ISD::VALIGN; |
30102 | LLVM_FALLTHROUGH;
30103 | case X86ISD::VALIGN: { |
30104 | if (EltVT != MVT::i32 && EltVT != MVT::i64) |
30105 | return false; |
30106 | uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
30107 | MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); |
30108 | unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits(); |
30109 | unsigned EltSize = EltVT.getSizeInBits(); |
30110 | // Make sure we can represent the same shift with the new VT. |
30111 | if ((ShiftAmt % EltSize) != 0) |
30112 | return false; |
30113 | Imm = ShiftAmt / EltSize; |
30114 | return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), |
30115 | DAG.getConstant(Imm, DL, MVT::i8)); |
30116 | } |
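    // For illustration (values assumed, not from the original source): a
    // v16i8 PALIGNR with Imm = 8 shifts by 8 * 8 = 64 bits. Viewed as v4i32
    // for VALIGND, that is 64 / 32 = 2 whole elements, so the combine
    // re-emits the shuffle as VALIGN with Imm = 2. An odd byte shift such as
    // Imm = 3 (24 bits) fails the (ShiftAmt % EltSize) check and is left
    // alone.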
30117 | case X86ISD::SHUF128: { |
30118 | if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) |
30119 | return false; |
30120 | // Only change element size, not type. |
30121 | if (VT.isInteger() != Op.getSimpleValueType().isInteger()) |
30122 | return false; |
30123 | return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), |
30124 | Op.getOperand(2)); |
30125 | } |
30126 | case ISD::INSERT_SUBVECTOR: { |
30127 | unsigned EltSize = EltVT.getSizeInBits(); |
30128 | if (EltSize != 32 && EltSize != 64) |
30129 | return false; |
30130 | MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); |
30131 | // Only change element size, not type. |
30132 | if (EltVT.isInteger() != OpEltVT.isInteger()) |
30133 | return false; |
30134 | uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); |
30135 | Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; |
30136 | SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0)); |
30137 | DCI.AddToWorklist(Op0.getNode()); |
30138 | // Op1 needs to be bitcasted to a smaller vector with the same element type. |
30139 | SDValue Op1 = Op.getOperand(1); |
30140 | MVT Op1VT = MVT::getVectorVT(EltVT, |
30141 | Op1.getSimpleValueType().getSizeInBits() / EltSize); |
30142 | Op1 = DAG.getBitcast(Op1VT, Op1); |
30143 | DCI.AddToWorklist(Op1.getNode()); |
30144 | DCI.CombineTo(OrigOp.getNode(), |
30145 | DAG.getNode(Opcode, DL, VT, Op0, Op1, |
30146 | DAG.getIntPtrConstant(Imm, DL))); |
30147 | return true; |
30148 | } |
30149 | case ISD::EXTRACT_SUBVECTOR: { |
30150 | unsigned EltSize = EltVT.getSizeInBits(); |
30151 | if (EltSize != 32 && EltSize != 64) |
30152 | return false; |
30153 | MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); |
30154 | // Only change element size, not type. |
30155 | if (EltVT.isInteger() != OpEltVT.isInteger()) |
30156 | return false; |
30157 | uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); |
30158 | Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; |
30159 | // Op0 needs to be bitcasted to a larger vector with the same element type. |
30160 | SDValue Op0 = Op.getOperand(0); |
30161 | MVT Op0VT = MVT::getVectorVT(EltVT, |
30162 | Op0.getSimpleValueType().getSizeInBits() / EltSize); |
30163 | Op0 = DAG.getBitcast(Op0VT, Op0); |
30164 | DCI.AddToWorklist(Op0.getNode()); |
30165 | DCI.CombineTo(OrigOp.getNode(), |
30166 | DAG.getNode(Opcode, DL, VT, Op0, |
30167 | DAG.getIntPtrConstant(Imm, DL))); |
30168 | return true; |
30169 | } |
30170 | case X86ISD::SUBV_BROADCAST: { |
30171 | unsigned EltSize = EltVT.getSizeInBits(); |
30172 | if (EltSize != 32 && EltSize != 64) |
30173 | return false; |
30174 | // Only change element size, not type. |
30175 | if (VT.isInteger() != Op.getSimpleValueType().isInteger()) |
30176 | return false; |
30177 | SDValue Op0 = Op.getOperand(0); |
30178 | MVT Op0VT = MVT::getVectorVT(EltVT, |
30179 | Op0.getSimpleValueType().getSizeInBits() / EltSize); |
30180 |     Op0 = DAG.getBitcast(Op0VT, Op0);
30181 | DCI.AddToWorklist(Op0.getNode()); |
30182 | DCI.CombineTo(OrigOp.getNode(), |
30183 | DAG.getNode(Opcode, DL, VT, Op0)); |
30184 | return true; |
30185 | } |
30186 | } |
30187 | |
30188 | return false; |
30189 | } |
30190 | |
30191 | /// Do target-specific dag combines on SELECT and VSELECT nodes. |
30192 | static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, |
30193 | TargetLowering::DAGCombinerInfo &DCI, |
30194 | const X86Subtarget &Subtarget) { |
30195 | SDLoc DL(N); |
30196 | SDValue Cond = N->getOperand(0); |
30197 | // Get the LHS/RHS of the select. |
30198 | SDValue LHS = N->getOperand(1); |
30199 | SDValue RHS = N->getOperand(2); |
30200 | EVT VT = LHS.getValueType(); |
30201 | EVT CondVT = Cond.getValueType(); |
30202 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
30203 | |
30204 | // If we have SSE[12] support, try to form min/max nodes. SSE min/max |
30205 | // instructions match the semantics of the common C idiom x<y?x:y but not |
30206 | // x<=y?x:y, because of how they handle negative zero (which can be |
30207 | // ignored in unsafe-math mode). |
30208 | // We also try to create v2f32 min/max nodes, which we later widen to v4f32. |
30209 | if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && |
30210 | VT != MVT::f80 && VT != MVT::f128 && |
30211 | (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && |
30212 | (Subtarget.hasSSE2() || |
30213 | (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { |
30214 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); |
30215 | |
30216 | unsigned Opcode = 0; |
30217 | // Check for x CC y ? x : y. |
30218 | if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && |
30219 | DAG.isEqualTo(RHS, Cond.getOperand(1))) { |
30220 | switch (CC) { |
30221 | default: break; |
30222 | case ISD::SETULT: |
30223 | // Converting this to a min would handle NaNs incorrectly, and swapping |
30224 | // the operands would cause it to handle comparisons between positive |
30225 | // and negative zero incorrectly. |
30226 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { |
30227 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30228 | !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) |
30229 | break; |
30230 | std::swap(LHS, RHS); |
30231 | } |
30232 | Opcode = X86ISD::FMIN; |
30233 | break; |
30234 | case ISD::SETOLE: |
30235 | // Converting this to a min would handle comparisons between positive |
30236 | // and negative zero incorrectly. |
30237 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30238 | !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) |
30239 | break; |
30240 | Opcode = X86ISD::FMIN; |
30241 | break; |
30242 | case ISD::SETULE: |
30243 | // Converting this to a min would handle both negative zeros and NaNs |
30244 | // incorrectly, but we can swap the operands to fix both. |
30245 | std::swap(LHS, RHS); |
30246 |       LLVM_FALLTHROUGH;
30247 | case ISD::SETOLT: |
30248 | case ISD::SETLT: |
30249 | case ISD::SETLE: |
30250 | Opcode = X86ISD::FMIN; |
30251 | break; |
30252 | |
30253 | case ISD::SETOGE: |
30254 | // Converting this to a max would handle comparisons between positive |
30255 | // and negative zero incorrectly. |
30256 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30257 | !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) |
30258 | break; |
30259 | Opcode = X86ISD::FMAX; |
30260 | break; |
30261 | case ISD::SETUGT: |
30262 | // Converting this to a max would handle NaNs incorrectly, and swapping |
30263 | // the operands would cause it to handle comparisons between positive |
30264 | // and negative zero incorrectly. |
30265 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { |
30266 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30267 | !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) |
30268 | break; |
30269 | std::swap(LHS, RHS); |
30270 | } |
30271 | Opcode = X86ISD::FMAX; |
30272 | break; |
30273 | case ISD::SETUGE: |
30274 | // Converting this to a max would handle both negative zeros and NaNs |
30275 | // incorrectly, but we can swap the operands to fix both. |
30276 | std::swap(LHS, RHS); |
30277 |       LLVM_FALLTHROUGH;
30278 | case ISD::SETOGT: |
30279 | case ISD::SETGT: |
30280 | case ISD::SETGE: |
30281 | Opcode = X86ISD::FMAX; |
30282 | break; |
30283 | } |
30284 | // Check for x CC y ? y : x -- a min/max with reversed arms. |
30285 | } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && |
30286 | DAG.isEqualTo(RHS, Cond.getOperand(0))) { |
30287 | switch (CC) { |
30288 | default: break; |
30289 | case ISD::SETOGE: |
30290 | // Converting this to a min would handle comparisons between positive |
30291 | // and negative zero incorrectly, and swapping the operands would |
30292 | // cause it to handle NaNs incorrectly. |
30293 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30294 | !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { |
30295 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) |
30296 | break; |
30297 | std::swap(LHS, RHS); |
30298 | } |
30299 | Opcode = X86ISD::FMIN; |
30300 | break; |
30301 | case ISD::SETUGT: |
30302 | // Converting this to a min would handle NaNs incorrectly. |
30303 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30304 | (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) |
30305 | break; |
30306 | Opcode = X86ISD::FMIN; |
30307 | break; |
30308 | case ISD::SETUGE: |
30309 | // Converting this to a min would handle both negative zeros and NaNs |
30310 | // incorrectly, but we can swap the operands to fix both. |
30311 | std::swap(LHS, RHS); |
30312 |       LLVM_FALLTHROUGH;
30313 | case ISD::SETOGT: |
30314 | case ISD::SETGT: |
30315 | case ISD::SETGE: |
30316 | Opcode = X86ISD::FMIN; |
30317 | break; |
30318 | |
30319 | case ISD::SETULT: |
30320 | // Converting this to a max would handle NaNs incorrectly. |
30321 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) |
30322 | break; |
30323 | Opcode = X86ISD::FMAX; |
30324 | break; |
30325 | case ISD::SETOLE: |
30326 | // Converting this to a max would handle comparisons between positive |
30327 | // and negative zero incorrectly, and swapping the operands would |
30328 | // cause it to handle NaNs incorrectly. |
30329 | if (!DAG.getTarget().Options.UnsafeFPMath && |
30330 | !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { |
30331 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) |
30332 | break; |
30333 | std::swap(LHS, RHS); |
30334 | } |
30335 | Opcode = X86ISD::FMAX; |
30336 | break; |
30337 | case ISD::SETULE: |
30338 | // Converting this to a max would handle both negative zeros and NaNs |
30339 | // incorrectly, but we can swap the operands to fix both. |
30340 | std::swap(LHS, RHS); |
30341 |       LLVM_FALLTHROUGH;
30342 | case ISD::SETOLT: |
30343 | case ISD::SETLT: |
30344 | case ISD::SETLE: |
30345 | Opcode = X86ISD::FMAX; |
30346 | break; |
30347 | } |
30348 | } |
30349 | |
30350 | if (Opcode) |
30351 | return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); |
30352 | } |
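  // A sketch of why the operand order above matters (SSE semantics recap):
  // MINSS/MINPS compute "a < b ? a : b" and return the second operand when
  // either input is NaN or when +0.0 is compared with -0.0. So
  //   x <  y ? x : y   --> FMIN(x, y)   ; direct match
  //   x u< y ? x : y   --> FMIN(y, x)   ; needs the swap performed above,
  // because the unordered compare picks x on NaN, and FMIN(y, x) also
  // returns its second operand (x) in that case.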
30353 | |
30354 |   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30355 |   // lowering on KNL. In this case we convert it to
30356 |   // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
30357 |   // The same applies to all 128- and 256-bit vectors of i8 and i16.
30358 |   // Starting with SKX, these selects have a proper lowering.
30359 | if (Subtarget.hasAVX512() && CondVT.isVector() && |
30360 | CondVT.getVectorElementType() == MVT::i1 && |
30361 | (VT.is128BitVector() || VT.is256BitVector()) && |
30362 | (VT.getVectorElementType() == MVT::i8 || |
30363 | VT.getVectorElementType() == MVT::i16) && |
30364 | !(Subtarget.hasBWI() && Subtarget.hasVLX())) { |
30365 | Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); |
30366 | DCI.AddToWorklist(Cond.getNode()); |
30367 | return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); |
30368 | } |
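  // Sketch of the rewrite above (illustrative types): on AVX-512 without
  // BWI/VLX, a v16i1 condition for a v16i8 select has no native blend, so
  //   (vselect v16i1 %m, v16i8 %a, v16i8 %b)
  // becomes
  //   %wide = sign_extend %m to v16i8   ; all-ones or all-zeros per lane
  //   (vselect v16i8 %wide, v16i8 %a, v16i8 %b)
  // which the AVX byte-blend (VPBLENDVB) lowering can then handle.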
30369 | |
30370 | if (SDValue V = combineSelectOfTwoConstants(N, DAG)) |
30371 | return V; |
30372 | |
30373 | // Canonicalize max and min: |
30374 | // (x > y) ? x : y -> (x >= y) ? x : y |
30375 | // (x < y) ? x : y -> (x <= y) ? x : y |
30376 | // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates |
30377 | // the need for an extra compare |
30378 | // against zero. e.g. |
30379 |   // (x - y) > 0 ? (x - y) : 0  ->  (x - y) >= 0 ? (x - y) : 0
30380 |   //   subl   %esi, %edi
30381 |   //   testl  %edi, %edi
30382 |   //   movl   $0, %eax
30383 |   //   cmovgl %edi, %eax
30384 |   // =>
30385 |   //   xorl   %eax, %eax
30386 |   //   subl   %esi, %edi
30387 |   //   cmovsl %eax, %edi
30388 | if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && |
30389 | DAG.isEqualTo(LHS, Cond.getOperand(0)) && |
30390 | DAG.isEqualTo(RHS, Cond.getOperand(1))) { |
30391 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); |
30392 | switch (CC) { |
30393 | default: break; |
30394 | case ISD::SETLT: |
30395 | case ISD::SETGT: { |
30396 | ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; |
30397 | Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), |
30398 | Cond.getOperand(0), Cond.getOperand(1), NewCC); |
30399 | return DAG.getSelect(DL, VT, Cond, LHS, RHS); |
30400 | } |
30401 | } |
30402 | } |
30403 | |
30404 | // Early exit check |
30405 | if (!TLI.isTypeLegal(VT)) |
30406 | return SDValue(); |
30407 | |
30408 | // Match VSELECTs into subs with unsigned saturation. |
30409 | if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && |
30410 | // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. |
30411 | ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || |
30412 | (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { |
30413 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); |
30414 | |
30415 |     // Check if one of the arms of the VSELECT is a zero vector. If it's on
30416 |     // the left side, invert the predicate to simplify the logic below.
30417 | SDValue Other; |
30418 | if (ISD::isBuildVectorAllZeros(LHS.getNode())) { |
30419 | Other = RHS; |
30420 | CC = ISD::getSetCCInverse(CC, true); |
30421 | } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { |
30422 | Other = LHS; |
30423 | } |
30424 | |
30425 | if (Other.getNode() && Other->getNumOperands() == 2 && |
30426 | DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { |
30427 | SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); |
30428 | SDValue CondRHS = Cond->getOperand(1); |
30429 | |
30430 | // Look for a general sub with unsigned saturation first. |
30431 | // x >= y ? x-y : 0 --> subus x, y |
30432 | // x > y ? x-y : 0 --> subus x, y |
30433 | if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && |
30434 | Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) |
30435 | return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); |
30436 | |
30437 | if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) |
30438 | if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { |
30439 | if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) |
30440 | if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) |
30441 | // If the RHS is a constant we have to reverse the const |
30442 | // canonicalization. |
30443 | // x > C-1 ? x+-C : 0 --> subus x, C |
30444 | if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && |
30445 | CondRHSConst->getAPIntValue() == |
30446 | (-OpRHSConst->getAPIntValue() - 1)) |
30447 | return DAG.getNode( |
30448 | X86ISD::SUBUS, DL, VT, OpLHS, |
30449 | DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); |
30450 | |
30451 | // Another special case: If C was a sign bit, the sub has been |
30452 | // canonicalized into a xor. |
30453 | // FIXME: Would it be better to use computeKnownBits to determine |
30454 | // whether it's safe to decanonicalize the xor? |
30455 | // x s< 0 ? x^C : 0 --> subus x, C |
30456 | if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && |
30457 | ISD::isBuildVectorAllZeros(CondRHS.getNode()) && |
30458 | OpRHSConst->getAPIntValue().isSignMask()) |
30459 | // Note that we have to rebuild the RHS constant here to ensure we |
30460 | // don't rely on particular values of undef lanes. |
30461 | return DAG.getNode( |
30462 | X86ISD::SUBUS, DL, VT, OpLHS, |
30463 | DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); |
30464 | } |
30465 | } |
30466 | } |
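  // A worked instance of the constant pattern above (constants assumed for
  // illustration): with unsigned i8 lanes,
  //   x > 41 ? x + (-42) : 0
  // matches because CondRHS == 41 == -(-42) - 1, so it is rebuilt as
  // PSUBUSB x, 42. Lanes with x <= 41 would wrap on the add; the saturating
  // subtract clamps exactly those lanes to 0, giving the same result.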
30467 | |
30468 | if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) |
30469 | return V; |
30470 | |
30471 | // If this is a *dynamic* select (non-constant condition) and we can match |
30472 | // this node with one of the variable blend instructions, restructure the |
30473 | // condition so that blends can use the high (sign) bit of each element and |
30474 | // use SimplifyDemandedBits to simplify the condition operand. |
30475 | if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && |
30476 | !DCI.isBeforeLegalize() && |
30477 | !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { |
30478 | unsigned BitWidth = Cond.getScalarValueSizeInBits(); |
30479 | |
30480 | // Don't optimize vector selects that map to mask-registers. |
30481 | if (BitWidth == 1) |
30482 | return SDValue(); |
30483 | |
30484 | // We can only handle the cases where VSELECT is directly legal on the |
30485 | // subtarget. We custom lower VSELECT nodes with constant conditions and |
30486 | // this makes it hard to see whether a dynamic VSELECT will correctly |
30487 | // lower, so we both check the operation's status and explicitly handle the |
30488 | // cases where a *dynamic* blend will fail even though a constant-condition |
30489 | // blend could be custom lowered. |
30490 | // FIXME: We should find a better way to handle this class of problems. |
30491 | // Potentially, we should combine constant-condition vselect nodes |
30492 | // pre-legalization into shuffles and not mark as many types as custom |
30493 | // lowered. |
30494 | if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) |
30495 | return SDValue(); |
30496 | // FIXME: We don't support i16-element blends currently. We could and |
30497 | // should support them by making *all* the bits in the condition be set |
30498 | // rather than just the high bit and using an i8-element blend. |
30499 | if (VT.getVectorElementType() == MVT::i16) |
30500 | return SDValue(); |
30501 | // Dynamic blending was only available from SSE4.1 onward. |
30502 | if (VT.is128BitVector() && !Subtarget.hasSSE41()) |
30503 | return SDValue(); |
30504 |     // Byte blends are only available in AVX2.
30505 | if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) |
30506 | return SDValue(); |
30507 | |
30508 |     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30509 | APInt DemandedMask(APInt::getSignMask(BitWidth)); |
30510 | KnownBits Known; |
30511 | TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), |
30512 | DCI.isBeforeLegalizeOps()); |
30513 | if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || |
30514 | TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) { |
30515 | // If we changed the computation somewhere in the DAG, this change will |
30516 | // affect all users of Cond. Make sure it is fine and update all the nodes |
30517 | // so that we do not use the generic VSELECT anymore. Otherwise, we may |
30518 | // perform wrong optimizations as we messed with the actual expectation |
30519 | // for the vector boolean values. |
30520 | if (Cond != TLO.Old) { |
30521 | // Check all uses of the condition operand to check whether it will be |
30522 | // consumed by non-BLEND instructions. Those may require that all bits |
30523 | // are set properly. |
30524 | for (SDNode *U : Cond->uses()) { |
30525 | // TODO: Add other opcodes eventually lowered into BLEND. |
30526 | if (U->getOpcode() != ISD::VSELECT) |
30527 | return SDValue(); |
30528 | } |
30529 | |
30530 | // Update all users of the condition before committing the change, so |
30531 | // that the VSELECT optimizations that expect the correct vector boolean |
30532 | // value will not be triggered. |
30533 | for (SDNode *U : Cond->uses()) { |
30534 | SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), |
30535 | U->getValueType(0), Cond, U->getOperand(1), |
30536 | U->getOperand(2)); |
30537 | DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); |
30538 | } |
30539 | DCI.CommitTargetLoweringOpt(TLO); |
30540 | return SDValue(); |
30541 | } |
30542 |       // Only Cond (rather than other nodes in the computation chain) was
30543 |       // changed. Change the condition just for N, so that all other users
30544 |       // can still be optimized in their own way.
30545 | SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); |
30546 | DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); |
30547 | return SDValue(); |
30548 | } |
30549 | } |
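  // e.g. (illustrative): for a v4i32 VSELECT feeding BLENDVPS, only bit 31
  // of each condition lane is consumed, so DemandedMask is the per-lane
  // sign mask. SimplifyDemandedBits can then strip anything that only
  // computes the low 31 bits, and the node is retagged X86ISD::SHRUNKBLEND
  // so later combines know the low condition bits are undefined rather
  // than all-ones.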
30550 | |
30551 | // Look for vselects with LHS/RHS being bitcasted from an operation that |
30552 | // can be executed on another type. Push the bitcast to the inputs of |
30553 | // the operation. This exposes opportunities for using masking instructions. |
30554 | if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && |
30555 | CondVT.getVectorElementType() == MVT::i1) { |
30556 | if (combineBitcastForMaskedOp(LHS, DAG, DCI)) |
30557 | return SDValue(N, 0); |
30558 | if (combineBitcastForMaskedOp(RHS, DAG, DCI)) |
30559 | return SDValue(N, 0); |
30560 | } |
30561 | |
30562 | return SDValue(); |
30563 | } |
30564 | |
30565 | /// Combine: |
30566 | /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) |
30567 | /// to: |
30568 | /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) |
30569 | /// i.e., reusing the EFLAGS produced by the LOCKed instruction. |
30570 | /// Note that this is only legal for some op/cc combinations. |
30571 | static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, |
30572 | SelectionDAG &DAG) { |
30573 | // This combine only operates on CMP-like nodes. |
30574 | if (!(Cmp.getOpcode() == X86ISD::CMP || |
30575 | (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) |
30576 | return SDValue(); |
30577 | |
30578 | // Can't replace the cmp if it has more uses than the one we're looking at. |
30579 | // FIXME: We would like to be able to handle this, but would need to make sure |
30580 | // all uses were updated. |
30581 | if (!Cmp.hasOneUse()) |
30582 | return SDValue(); |
30583 | |
30584 | // This only applies to variations of the common case: |
30585 | // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) |
30586 | // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) |
30587 | // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) |
30588 | // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) |
30589 | // Using the proper condcodes (see below), overflow is checked for. |
30590 | |
30591 | // FIXME: We can generalize both constraints: |
30592 | // - XOR/OR/AND (if they were made to survive AtomicExpand) |
30593 | // - LHS != 1 |
30594 | // if the result is compared. |
30595 | |
30596 | SDValue CmpLHS = Cmp.getOperand(0); |
30597 | SDValue CmpRHS = Cmp.getOperand(1); |
30598 | |
30599 | if (!CmpLHS.hasOneUse()) |
30600 | return SDValue(); |
30601 | |
30602 | auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); |
30603 | if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) |
30604 | return SDValue(); |
30605 | |
30606 | const unsigned Opc = CmpLHS.getOpcode(); |
30607 | |
30608 | if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) |
30609 | return SDValue(); |
30610 | |
30611 | SDValue OpRHS = CmpLHS.getOperand(2); |
30612 | auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS); |
30613 | if (!OpRHSC) |
30614 | return SDValue(); |
30615 | |
30616 | APInt Addend = OpRHSC->getAPIntValue(); |
30617 | if (Opc == ISD::ATOMIC_LOAD_SUB) |
30618 | Addend = -Addend; |
30619 | |
30620 | if (CC == X86::COND_S && Addend == 1) |
30621 | CC = X86::COND_LE; |
30622 | else if (CC == X86::COND_NS && Addend == 1) |
30623 | CC = X86::COND_G; |
30624 | else if (CC == X86::COND_G && Addend == -1) |
30625 | CC = X86::COND_GE; |
30626 | else if (CC == X86::COND_LE && Addend == -1) |
30627 | CC = X86::COND_L; |
30628 | else |
30629 | return SDValue(); |
30630 | |
30631 | SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); |
30632 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), |
30633 | DAG.getUNDEF(CmpLHS.getValueType())); |
30634 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); |
30635 | return LockOp; |
30636 | } |
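// Illustration of the CC remapping above (sketch): ATOMIC_LOAD_ADD returns
// the value *before* the addition, while the LOCKed RMW instruction sets
// EFLAGS from the value *after* it. For Addend == 1:
//   old <  0  (COND_S)   <=>  old + 1 <= 0  (COND_LE on the LADD flags)
//   old >= 0  (COND_NS)  <=>  old + 1 >  0  (COND_G)
// So "if (atomic_fetch_add(&v, 1) < 0)" can branch on the flags of
// "lock addl $1, (v)" with JLE. COND_LE/COND_G also test SF != OF, which
// keeps the mapping correct when old == INT_MAX overflows.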
30637 | |
30638 | // Check whether a boolean test is testing a boolean value generated by |
30639 | // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition |
30640 | // code. |
30641 | // |
30642 | // Simplify the following patterns: |
30643 | // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or |
30644 | // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) |
30645 | // to (Op EFLAGS Cond) |
30646 | // |
30647 | // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or |
30648 | // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) |
30649 | // to (Op EFLAGS !Cond) |
30650 | // |
30651 | // where Op could be BRCOND or CMOV. |
30652 | // |
30653 | static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { |
30654 | // This combine only operates on CMP-like nodes. |
30655 | if (!(Cmp.getOpcode() == X86ISD::CMP || |
30656 | (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) |
30657 | return SDValue(); |
30658 | |
30659 | // Quit if not used as a boolean value. |
30660 | if (CC != X86::COND_E && CC != X86::COND_NE) |
30661 | return SDValue(); |
30662 | |
30663 | // Check CMP operands. One of them should be 0 or 1 and the other should be |
30664 | // an SetCC or extended from it. |
30665 | SDValue Op1 = Cmp.getOperand(0); |
30666 | SDValue Op2 = Cmp.getOperand(1); |
30667 | |
30668 | SDValue SetCC; |
30669 | const ConstantSDNode* C = nullptr; |
30670 | bool needOppositeCond = (CC == X86::COND_E); |
30671 | bool checkAgainstTrue = false; // Is it a comparison against 1? |
30672 | |
30673 | if ((C = dyn_cast<ConstantSDNode>(Op1))) |
30674 | SetCC = Op2; |
30675 | else if ((C = dyn_cast<ConstantSDNode>(Op2))) |
30676 | SetCC = Op1; |
30677 |   else // Quit if neither operand is a constant.
30678 | return SDValue(); |
30679 | |
30680 | if (C->getZExtValue() == 1) { |
30681 | needOppositeCond = !needOppositeCond; |
30682 | checkAgainstTrue = true; |
30683 | } else if (C->getZExtValue() != 0) |
30684 |     // Quit if the constant is neither 0 nor 1.
30685 | return SDValue(); |
30686 | |
30687 | bool truncatedToBoolWithAnd = false; |
30688 |   // Skip (zext $x), (trunc $x), or (and $x, 1) nodes.
30689 | while (SetCC.getOpcode() == ISD::ZERO_EXTEND || |
30690 | SetCC.getOpcode() == ISD::TRUNCATE || |
30691 | SetCC.getOpcode() == ISD::AND) { |
30692 | if (SetCC.getOpcode() == ISD::AND) { |
30693 | int OpIdx = -1; |
30694 | if (isOneConstant(SetCC.getOperand(0))) |
30695 | OpIdx = 1; |
30696 | if (isOneConstant(SetCC.getOperand(1))) |
30697 | OpIdx = 0; |
30698 | if (OpIdx < 0) |
30699 | break; |
30700 | SetCC = SetCC.getOperand(OpIdx); |
30701 | truncatedToBoolWithAnd = true; |
30702 | } else |
30703 | SetCC = SetCC.getOperand(0); |
30704 | } |
30705 | |
30706 | switch (SetCC.getOpcode()) { |
30707 | case X86ISD::SETCC_CARRY: |
30708 | // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to |
30709 | // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, |
30710 | // i.e. it's a comparison against true but the result of SETCC_CARRY is not |
30711 | // truncated to i1 using 'and'. |
30712 | if (checkAgainstTrue && !truncatedToBoolWithAnd) |
30713 | break; |
30714 |     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30715 |            "Invalid use of SETCC_CARRY!");
30716 |     LLVM_FALLTHROUGH;
30717 | case X86ISD::SETCC: |
30718 | // Set the condition code or opposite one if necessary. |
30719 | CC = X86::CondCode(SetCC.getConstantOperandVal(0)); |
30720 | if (needOppositeCond) |
30721 | CC = X86::GetOppositeBranchCondition(CC); |
30722 | return SetCC.getOperand(1); |
30723 | case X86ISD::CMOV: { |
30724 | // Check whether false/true value has canonical one, i.e. 0 or 1. |
30725 | ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); |
30726 | ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); |
30727 | // Quit if true value is not a constant. |
30728 | if (!TVal) |
30729 | return SDValue(); |
30730 | // Quit if false value is not a constant. |
30731 | if (!FVal) { |
30732 | SDValue Op = SetCC.getOperand(0); |
30733 | // Skip 'zext' or 'trunc' node. |
30734 | if (Op.getOpcode() == ISD::ZERO_EXTEND || |
30735 | Op.getOpcode() == ISD::TRUNCATE) |
30736 | Op = Op.getOperand(0); |
30737 |       // A special case for rdrand/rdseed, where 0 is produced when the
30738 |       // condition is false.
30739 | if ((Op.getOpcode() != X86ISD::RDRAND && |
30740 | Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) |
30741 | return SDValue(); |
30742 | } |
30743 | // Quit if false value is not the constant 0 or 1. |
30744 | bool FValIsFalse = true; |
30745 | if (FVal && FVal->getZExtValue() != 0) { |
30746 | if (FVal->getZExtValue() != 1) |
30747 | return SDValue(); |
30748 | // If FVal is 1, opposite cond is needed. |
30749 | needOppositeCond = !needOppositeCond; |
30750 | FValIsFalse = false; |
30751 | } |
30752 | // Quit if TVal is not the constant opposite of FVal. |
30753 | if (FValIsFalse && TVal->getZExtValue() != 1) |
30754 | return SDValue(); |
30755 | if (!FValIsFalse && TVal->getZExtValue() != 0) |
30756 | return SDValue(); |
30757 | CC = X86::CondCode(SetCC.getConstantOperandVal(2)); |
30758 | if (needOppositeCond) |
30759 | CC = X86::GetOppositeBranchCondition(CC); |
30760 | return SetCC.getOperand(3); |
30761 | } |
30762 | } |
30763 | |
30764 | return SDValue(); |
30765 | } |
30766 | |
30767 | /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. |
30768 | /// Match: |
30769 | /// (X86or (X86setcc) (X86setcc)) |
30770 | /// (X86cmp (and (X86setcc) (X86setcc)), 0) |
30771 | static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, |
30772 | X86::CondCode &CC1, SDValue &Flags, |
30773 | bool &isAnd) { |
30774 | if (Cond->getOpcode() == X86ISD::CMP) { |
30775 | if (!isNullConstant(Cond->getOperand(1))) |
30776 | return false; |
30777 | |
30778 | Cond = Cond->getOperand(0); |
30779 | } |
30780 | |
30781 | isAnd = false; |
30782 | |
30783 | SDValue SetCC0, SetCC1; |
30784 | switch (Cond->getOpcode()) { |
30785 | default: return false; |
30786 | case ISD::AND: |
30787 | case X86ISD::AND: |
30788 | isAnd = true; |
30789 |     LLVM_FALLTHROUGH;
30790 | case ISD::OR: |
30791 | case X86ISD::OR: |
30792 | SetCC0 = Cond->getOperand(0); |
30793 | SetCC1 = Cond->getOperand(1); |
30794 | break; |
30795 |   }
30796 | |
30797 | // Make sure we have SETCC nodes, using the same flags value. |
30798 | if (SetCC0.getOpcode() != X86ISD::SETCC || |
30799 | SetCC1.getOpcode() != X86ISD::SETCC || |
30800 | SetCC0->getOperand(1) != SetCC1->getOperand(1)) |
30801 | return false; |
30802 | |
30803 | CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); |
30804 | CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); |
30805 | Flags = SetCC0->getOperand(1); |
30806 | return true; |
30807 | } |
30808 | |
30809 | /// Optimize an EFLAGS definition used according to the condition code \p CC |
30810 | /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing |
30811 | /// uses of chain values. |
30812 | static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, |
30813 | SelectionDAG &DAG) { |
30814 | if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) |
30815 | return R; |
30816 | return combineSetCCAtomicArith(EFLAGS, CC, DAG); |
30817 | } |
30818 | |
30819 | /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] |
30820 | static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, |
30821 | TargetLowering::DAGCombinerInfo &DCI, |
30822 | const X86Subtarget &Subtarget) { |
30823 | SDLoc DL(N); |
30824 | |
30825 | // If the flag operand isn't dead, don't touch this CMOV. |
30826 | if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) |
30827 | return SDValue(); |
30828 | |
30829 | SDValue FalseOp = N->getOperand(0); |
30830 | SDValue TrueOp = N->getOperand(1); |
30831 | X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); |
30832 | SDValue Cond = N->getOperand(3); |
30833 | |
30834 | if (CC == X86::COND_E || CC == X86::COND_NE) { |
30835 | switch (Cond.getOpcode()) { |
30836 | default: break; |
30837 | case X86ISD::BSR: |
30838 | case X86ISD::BSF: |
30839 |     // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
30840 | if (DAG.isKnownNeverZero(Cond.getOperand(0))) |
30841 | return (CC == X86::COND_E) ? FalseOp : TrueOp; |
30842 | } |
30843 | } |
30844 | |
30845 | // Try to simplify the EFLAGS and condition code operands. |
30846 | // We can't always do this as FCMOV only supports a subset of X86 cond. |
30847 | if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) { |
30848 | if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { |
30849 | SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), |
30850 | Flags}; |
30851 | return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); |
30852 | } |
30853 | } |
30854 | |
30855 | // If this is a select between two integer constants, try to do some |
30856 | // optimizations. Note that the operands are ordered the opposite of SELECT |
30857 | // operands. |
30858 | if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { |
30859 | if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { |
30860 | // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is |
30861 | // larger than FalseC (the false value). |
30862 | if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { |
30863 | CC = X86::GetOppositeBranchCondition(CC); |
30864 | std::swap(TrueC, FalseC); |
30865 | std::swap(TrueOp, FalseOp); |
30866 | } |
30867 | |
30868 | // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. |
30869 | // This is efficient for any integer data type (including i8/i16) and |
30870 | // shift amount. |
30871 | if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { |
30872 | Cond = getSETCC(CC, Cond, DL, DAG); |
30873 | |
30874 | // Zero extend the condition if needed. |
30875 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); |
30876 | |
30877 | unsigned ShAmt = TrueC->getAPIntValue().logBase2(); |
30878 | Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, |
30879 | DAG.getConstant(ShAmt, DL, MVT::i8)); |
30880 | if (N->getNumValues() == 2) // Dead flag value? |
30881 | return DCI.CombineTo(N, Cond, SDValue()); |
30882 | return Cond; |
30883 | } |
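      // e.g. (illustrative): "flags ? 8 : 0" with CC == COND_E becomes
      // roughly
      //   sete   %al          ; Cond = getSETCC(CC, ...)
      //   movzbl %al, %eax    ; ISD::ZERO_EXTEND
      //   shll   $3, %eax     ; ISD::SHL by log2(8)
      // avoiding a cmov and the materialization of both constants.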
30884 | |
30885 |       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is
30886 |       // efficient for any integer data type, including i8/i16.
30887 | if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { |
30888 | Cond = getSETCC(CC, Cond, DL, DAG); |
30889 | |
30890 | // Zero extend the condition if needed. |
30891 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, |
30892 | FalseC->getValueType(0), Cond); |
30893 | Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, |
30894 | SDValue(FalseC, 0)); |
30895 | |
30896 | if (N->getNumValues() == 2) // Dead flag value? |
30897 | return DCI.CombineTo(N, Cond, SDValue()); |
30898 | return Cond; |
30899 | } |
30900 | |
30901 | // Optimize cases that will turn into an LEA instruction. This requires |
30902 | // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). |
30903 | if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { |
30904 | uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); |
30905 | if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; |
30906 | |
30907 | bool isFastMultiplier = false; |
30908 | if (Diff < 10) { |
30909 | switch ((unsigned char)Diff) { |
30910 | default: break; |
30911 | case 1: // result = add base, cond |
30912 | case 2: // result = lea base( , cond*2) |
30913 | case 3: // result = lea base(cond, cond*2) |
30914 | case 4: // result = lea base( , cond*4) |
30915 | case 5: // result = lea base(cond, cond*4) |
30916 | case 8: // result = lea base( , cond*8) |
30917 | case 9: // result = lea base(cond, cond*8) |
30918 | isFastMultiplier = true; |
30919 | break; |
30920 | } |
30921 | } |
30922 | |
30923 | if (isFastMultiplier) { |
30924 | APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); |
30925 |         Cond = getSETCC(CC, Cond, DL, DAG);
30926 | // Zero extend the condition if needed. |
30927 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), |
30928 | Cond); |
30929 | // Scale the condition by the difference. |
30930 | if (Diff != 1) |
30931 | Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, |
30932 | DAG.getConstant(Diff, DL, Cond.getValueType())); |
30933 | |
30934 | // Add the base if non-zero. |
30935 | if (FalseC->getAPIntValue() != 0) |
30936 | Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, |
30937 | SDValue(FalseC, 0)); |
30938 | if (N->getNumValues() == 2) // Dead flag value? |
30939 | return DCI.CombineTo(N, Cond, SDValue()); |
30940 | return Cond; |
30941 | } |
30942 | } |
30943 | } |
30944 | } |
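  // e.g. (illustrative): "flags ? 8 : 3" has Diff == 5, a fast multiplier,
  // so after setcc + zext the select is roughly
  //   leal 3(%rax,%rax,4), %eax    ; 3 + cond * 5
  // where ISel folds the MUL by 5 and the ADD of the base into one LEA.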
30945 | |
30946 |   // Handle these cases:
30947 |   //   (select (x != c), e, c) -> (select (x != c), e, x),
30948 |   //   (select (x == c), c, e) -> (select (x == c), x, e)
30949 |   // where c is an integer constant, and the "select" is the combination
30950 |   // of CMOV and CMP.
30951 |   //
30952 |   // The rationale for this change is that the conditional-move from a constant
30953 |   // needs two instructions; conditional-move from a register needs
30954 |   // only one instruction.
30955 | // |
30956 | // CAVEAT: By replacing a constant with a symbolic value, it may obscure |
30957 | // some instruction-combining opportunities. This opt needs to be |
30958 | // postponed as late as possible. |
30959 | // |
30960 | if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { |
30961 | // the DCI.xxxx conditions are provided to postpone the optimization as |
30962 | // late as possible. |
30963 | |
30964 | ConstantSDNode *CmpAgainst = nullptr; |
30965 | if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && |
30966 | (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && |
30967 | !isa<ConstantSDNode>(Cond.getOperand(0))) { |
30968 | |
30969 | if (CC == X86::COND_NE && |
30970 | CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { |
30971 | CC = X86::GetOppositeBranchCondition(CC); |
30972 | std::swap(TrueOp, FalseOp); |
30973 | } |
30974 | |
30975 | if (CC == X86::COND_E && |
30976 | CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { |
30977 | SDValue Ops[] = { FalseOp, Cond.getOperand(0), |
30978 | DAG.getConstant(CC, DL, MVT::i8), Cond }; |
30979 |         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30980 | } |
30981 | } |
30982 | } |
30983 | |
30984 | // Fold and/or of setcc's to double CMOV: |
30985 | // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) |
30986 | // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) |
30987 | // |
30988 | // This combine lets us generate: |
30989 | // cmovcc1 (jcc1 if we don't have CMOV) |
30990 | // cmovcc2 (same) |
30991 | // instead of: |
30992 | // setcc1 |
30993 | // setcc2 |
30994 | // and/or |
30995 | // cmovne (jne if we don't have CMOV) |
30996 | // When we can't use the CMOV instruction, it might increase branch |
30997 | // mispredicts. |
30998 | // When we can use CMOV, or when there is no mispredict, this improves |
30999 | // throughput and reduces register pressure. |
31000 | // |
31001 | if (CC == X86::COND_NE) { |
31002 | SDValue Flags; |
31003 | X86::CondCode CC0, CC1; |
31004 | bool isAndSetCC; |
31005 | if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { |
31006 | if (isAndSetCC) { |
31007 | std::swap(FalseOp, TrueOp); |
31008 | CC0 = X86::GetOppositeBranchCondition(CC0); |
31009 | CC1 = X86::GetOppositeBranchCondition(CC1); |
31010 | } |
31011 | |
31012 | SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), |
31013 | Flags}; |
31014 | SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); |
31015 | SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; |
31016 | SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); |
31017 | DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); |
31018 | return CMOV; |
31019 | } |
31020 | } |
31021 | |
31022 | return SDValue(); |
31023 | } |
31024 | |
31025 | /// Different mul shrinking modes. |
31026 | enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; |
31027 | |
31028 | static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { |
31029 | EVT VT = N->getOperand(0).getValueType(); |
31030 | if (VT.getScalarSizeInBits() != 32) |
31031 | return false; |
31032 | |
31033 |   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31034 | unsigned SignBits[2] = {1, 1}; |
31035 | bool IsPositive[2] = {false, false}; |
31036 | for (unsigned i = 0; i < 2; i++) { |
31037 | SDValue Opd = N->getOperand(i); |
31038 | |
31039 |     // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
31040 |     // compute the sign bits for it separately.
31041 | if (Opd.getOpcode() == ISD::ANY_EXTEND) { |
31042 | // For anyextend, it is safe to assume an appropriate number of leading |
31043 | // sign/zero bits. |
31044 | if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) |
31045 | SignBits[i] = 25; |
31046 | else if (Opd.getOperand(0).getValueType().getVectorElementType() == |
31047 | MVT::i16) |
31048 | SignBits[i] = 17; |
31049 | else |
31050 | return false; |
31051 | IsPositive[i] = true; |
31052 | } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) { |
31053 |       // All the operands of BUILD_VECTOR need to be integer constants.
31054 | // Find the smallest value range which all the operands belong to. |
31055 | SignBits[i] = 32; |
31056 | IsPositive[i] = true; |
31057 | for (const SDValue &SubOp : Opd.getNode()->op_values()) { |
31058 | if (SubOp.isUndef()) |
31059 | continue; |
31060 | auto *CN = dyn_cast<ConstantSDNode>(SubOp); |
31061 | if (!CN) |
31062 | return false; |
31063 | APInt IntVal = CN->getAPIntValue(); |
31064 | if (IntVal.isNegative()) |
31065 | IsPositive[i] = false; |
31066 | SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits()); |
31067 | } |
31068 | } else { |
31069 | SignBits[i] = DAG.ComputeNumSignBits(Opd); |
31070 | if (Opd.getOpcode() == ISD::ZERO_EXTEND) |
31071 | IsPositive[i] = true; |
31072 | } |
31073 | } |
31074 | |
31075 | bool AllPositive = IsPositive[0] && IsPositive[1]; |
31076 | unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); |
31077 | // When ranges are from -128 ~ 127, use MULS8 mode. |
31078 | if (MinSignBits >= 25) |
31079 | Mode = MULS8; |
31080 | // When ranges are from 0 ~ 255, use MULU8 mode. |
31081 | else if (AllPositive && MinSignBits >= 24) |
31082 | Mode = MULU8; |
31083 | // When ranges are from -32768 ~ 32767, use MULS16 mode. |
31084 | else if (MinSignBits >= 17) |
31085 | Mode = MULS16; |
31086 | // When ranges are from 0 ~ 65535, use MULU16 mode. |
31087 | else if (AllPositive && MinSignBits >= 16) |
31088 | Mode = MULU16; |
31089 | else |
31090 | return false; |
31091 | return true; |
31092 | } |
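// Numeric intuition for the thresholds above (sketch): in a 32-bit lane,
// ComputeNumSignBits >= 25 means the top 25 bits are copies of the sign
// bit, leaving at most 7 payload bits, so the value fits in i8
// ([-128, 127]). Likewise >= 17 sign bits fits i16. The AllPositive
// variants need one bit less redundancy (24 / 16) because nonnegative
// values with 24 leading zero bits already lie in [0, 255].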
31093 | |
31094 | /// When the operands of a vector mul are extended from smaller size values,
31095 | /// like i8 and i16, the type of the mul may be shrunk to generate more
31096 | /// efficient code. Two typical patterns are handled:
31097 | /// Pattern1: |
31098 | /// %2 = sext/zext <N x i8> %1 to <N x i32> |
31099 | /// %4 = sext/zext <N x i8> %3 to <N x i32> |
31100 | /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31101 | /// %5 = mul <N x i32> %2, %4 |
31102 | /// |
31103 | /// Pattern2: |
31104 | /// %2 = zext/sext <N x i16> %1 to <N x i32> |
31105 | /// %4 = zext/sext <N x i16> %3 to <N x i32> |
31106 | /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) |
31107 | /// %5 = mul <N x i32> %2, %4 |
31108 | /// |
31109 | /// There are four mul shrinking modes: |
31110 | /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31111 | /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
31112 | /// generate pmullw+sext32 for it (MULS8 mode). |
31113 | /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is |
31114 | /// 0 to 255, and the scalar value range of %4 is also 0 to 255, |
31115 | /// generate pmullw+zext32 for it (MULU8 mode). |
31116 | /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is |
31117 | /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, |
31118 | /// generate pmullw+pmulhw for it (MULS16 mode). |
31119 | /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is |
31120 | /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, |
31121 | /// generate pmullw+pmulhuw for it (MULU16 mode). |
31122 | static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, |
31123 | const X86Subtarget &Subtarget) { |
31124 | // Check for legality |
31125 |   // pmullw/pmulhw require SSE2.
31126 | if (!Subtarget.hasSSE2()) |
31127 | return SDValue(); |
31128 | |
31129 | // Check for profitability |
31130 |   // pmulld is supported since SSE4.1. It is better to use pmulld
31131 | // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than |
31132 | // the expansion. |
31133 | bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); |
31134 | if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) |
31135 | return SDValue(); |
31136 | |
31137 | ShrinkMode Mode; |
31138 | if (!canReduceVMulWidth(N, DAG, Mode)) |
31139 | return SDValue(); |
31140 | |
31141 | SDLoc DL(N); |
31142 | SDValue N0 = N->getOperand(0); |
31143 | SDValue N1 = N->getOperand(1); |
31144 | EVT VT = N->getOperand(0).getValueType(); |
31145 | unsigned RegSize = 128; |
31146 | MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); |
31147 | EVT ReducedVT = |
31148 | EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); |
31149 | // Shrink the operands of mul. |
31150 | SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); |
31151 | SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); |
31152 | |
31153 | if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) { |
31154 | // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the |
31155 | // lower part is needed. |
31156 | SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); |
31157 | if (Mode == MULU8 || Mode == MULS8) { |
31158 | return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, |
31159 | DL, VT, MulLo); |
31160 | } else { |
31161 | MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); |
31162 | // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, |
31163 | // the higher part is also needed. |
31164 | SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, |
31165 | ReducedVT, NewN0, NewN1); |
31166 | |
31167 | // Repack the lower part and higher part result of mul into a wider |
31168 | // result. |
31169 | // Generate shuffle functioning as punpcklwd. |
31170 | SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); |
31171 | for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { |
31172 | ShuffleMask[2 * i] = i; |
31173 | ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements(); |
31174 | } |
31175 | SDValue ResLo = |
31176 | DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); |
31177 | ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo); |
31178 | // Generate shuffle functioning as punpckhwd. |
31179 | for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { |
31180 | ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2; |
31181 | ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2; |
31182 | } |
31183 | SDValue ResHi = |
31184 | DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); |
31185 | ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi); |
31186 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); |
31187 | } |
31188 | } else { |
31189 | // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want |
31190 | // to legalize the mul explicitly because implicit legalization for type |
31191 | // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack |
31192 | // instructions which will not exist when we explicitly legalize it by |
31193 | // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with |
31194 | // <4 x i16> undef). |
31195 | // |
31196 | // Legalize the operands of mul. |
31197 | // FIXME: We may be able to handle non-concatenated vectors by insertion. |
31198 | unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); |
31199 | if ((RegSize % ReducedSizeInBits) != 0) |
31200 | return SDValue(); |
31201 | |
31202 | SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, |
31203 | DAG.getUNDEF(ReducedVT)); |
31204 | Ops[0] = NewN0; |
31205 | NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); |
31206 | Ops[0] = NewN1; |
31207 | NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); |
31208 | |
31209 | if (Mode == MULU8 || Mode == MULS8) { |
31210 | // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower |
31211 | // part is needed. |
31212 | SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); |
31213 | |
31214 |       // Convert the type of the mul result to VT.
31215 | MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); |
31216 | SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG |
31217 | : ISD::SIGN_EXTEND_VECTOR_INREG, |
31218 | DL, ResVT, Mul); |
31219 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, |
31220 | DAG.getIntPtrConstant(0, DL)); |
31221 | } else { |
31222 | // Generate the lower and higher part of mul: pmulhw/pmulhuw. For |
31223 | // MULU16/MULS16, both parts are needed. |
31224 | SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); |
31225 | SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, |
31226 | OpsVT, NewN0, NewN1); |
31227 | |
31228 | // Repack the lower part and higher part result of mul into a wider |
31229 | // result. Make sure the type of mul result is VT. |
31230 | MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); |
31231 | SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi); |
31232 | Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res); |
31233 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, |
31234 | DAG.getIntPtrConstant(0, DL)); |
31235 | } |
31236 | } |
31237 | } |
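// Worked view of the MULS16/MULU16 repack above (sketch): for 16-bit lanes
// a and b, the full 32-bit product is
//   lo16 = pmullw(a, b),  hi16 = pmulhw(a, b)   ; pmulhuw for MULU16
//   product32 == ((i32)hi16 << 16) | (u16)lo16
// and since x86 is little-endian, interleaving {lo, hi} word pairs with
// punpcklwd/punpckhwd puts each pair in memory order, so the bitcast to
// v*i32 reads back the correct 32-bit products.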
31238 | |
31239 | static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, |
31240 | EVT VT, SDLoc DL) { |
31241 | |
31242 | auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { |
31243 | SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), |
31244 | DAG.getConstant(Mult, DL, VT)); |
31245 | Result = DAG.getNode(ISD::SHL, DL, VT, Result, |
31246 | DAG.getConstant(Shift, DL, MVT::i8)); |
31247 | Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, |
31248 | N->getOperand(0)); |
31249 | return Result; |
31250 | }; |
31251 | |
31252 | auto combineMulMulAddOrSub = [&](bool isAdd) { |
31253 | SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), |
31254 | DAG.getConstant(9, DL, VT)); |
31255 | Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); |
31256 | Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, |
31257 | N->getOperand(0)); |
31258 | return Result; |
31259 | }; |
31260 | |
31261 | switch (MulAmt) { |
31262 | default: |
31263 | break; |
31264 | case 11: |
31265 | // mul x, 11 => add ((shl (mul x, 5), 1), x) |
31266 | return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); |
31267 | case 21: |
31268 | // mul x, 21 => add ((shl (mul x, 5), 2), x) |
31269 | return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); |
31270 | case 22: |
31271 | // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) |
31272 | return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), |
31273 | combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); |
31274 | case 19: |
31275 | // mul x, 19 => sub ((shl (mul x, 5), 2), x) |
31276 | return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); |
31277 | case 13: |
31278 | // mul x, 13 => add ((shl (mul x, 3), 2), x) |
31279 | return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); |
31280 | case 23: |
31281 |     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
31282 | return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); |
31283 | case 14: |
31284 | // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) |
31285 | return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), |
31286 | combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); |
31287 | case 26: |
31288 | // mul x, 26 => sub ((mul (mul x, 9), 3), x) |
31289 | return combineMulMulAddOrSub(/*isAdd*/ false); |
31290 | case 28: |
31291 | // mul x, 28 => add ((mul (mul x, 9), 3), x) |
31292 | return combineMulMulAddOrSub(/*isAdd*/ true); |
31293 | case 29: |
31294 | // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) |
31295 | return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), |
31296 | combineMulMulAddOrSub(/*isAdd*/ true)); |
31297 | case 30: |
31298 | // mul x, 30 => sub (sub ((shl x, 5), x), x) |
31299 | return DAG.getNode( |
31300 | ISD::SUB, DL, VT, |
31301 | DAG.getNode(ISD::SUB, DL, VT, |
31302 | DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), |
31303 | DAG.getConstant(5, DL, MVT::i8)), |
31304 | N->getOperand(0)), |
31305 | N->getOperand(0)); |
31306 | } |
31307 | return SDValue(); |
31308 | } |
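// Spot-check of one decomposition above (value assumed for illustration):
// for x == 7, "mul x, 11 => add ((shl (mul x, 5), 1), x)" gives
//   7 * 5 = 35;  35 << 1 = 70;  70 + 7 = 77 == 7 * 11.
// The MUL_IMM by 5 and the trailing add both map onto LEA, so these
// sequences are intended to beat imul on targets where LEA is fast
// (note the !Subtarget.slowLEA() guard at the call site).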
31309 | |
31310 | /// Optimize a single multiply with constant into two operations in order to |
31311 | /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. |
31312 | static SDValue combineMul(SDNode *N, SelectionDAG &DAG, |
31313 | TargetLowering::DAGCombinerInfo &DCI, |
31314 | const X86Subtarget &Subtarget) { |
31315 | EVT VT = N->getValueType(0); |
31316 | if (DCI.isBeforeLegalize() && VT.isVector()) |
31317 | return reduceVMULWidth(N, DAG, Subtarget); |
31318 | |
31319 | if (!MulConstantOptimization) |
31320 | return SDValue(); |
31321 | // An imul is usually smaller than the alternative sequence. |
31322 | if (DAG.getMachineFunction().getFunction()->optForMinSize()) |
31323 | return SDValue(); |
31324 | |
31325 | if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) |
31326 | return SDValue(); |
31327 | |
31328 | if (VT != MVT::i64 && VT != MVT::i32) |
31329 | return SDValue(); |
31330 | |
31331 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); |
31332 | if (!C) |
31333 | return SDValue(); |
31334 | uint64_t MulAmt = C->getZExtValue(); |
31335 | if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) |
31336 | return SDValue(); |
31337 | |
31338 | uint64_t MulAmt1 = 0; |
31339 | uint64_t MulAmt2 = 0; |
31340 | if ((MulAmt % 9) == 0) { |
31341 | MulAmt1 = 9; |
31342 | MulAmt2 = MulAmt / 9; |
31343 | } else if ((MulAmt % 5) == 0) { |
31344 | MulAmt1 = 5; |
31345 | MulAmt2 = MulAmt / 5; |
31346 | } else if ((MulAmt % 3) == 0) { |
31347 | MulAmt1 = 3; |
31348 | MulAmt2 = MulAmt / 3; |
31349 | } |
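  // e.g. (illustrative): MulAmt == 45 factors as 9 * 5, so the code below
  // emits two MUL_IMM nodes that ISel can select as
  //   leal (%rdi,%rdi,8), %eax    ; x * 9
  //   leal (%rax,%rax,4), %eax    ; (x * 9) * 5 == x * 45
  // while MulAmt == 40 == 5 * 8 becomes a shift plus one LEA.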
31350 | |
31351 | SDLoc DL(N); |
31352 | SDValue NewMul; |
31353 | if (MulAmt2 && |
31354 | (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ |
31355 | |
31356 | if (isPowerOf2_64(MulAmt2) && |
31357 | !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) |
31358 |       // If the second multiplier is pow2, issue it first. We want the
31359 |       // multiply by 3, 5, or 9 to be folded into the addressing mode unless
31360 |       // the lone use is an add.
31361 | std::swap(MulAmt1, MulAmt2); |
31362 | |
31363 | if (isPowerOf2_64(MulAmt1)) |
31364 | NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), |
31365 | DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); |
31366 | else |
31367 | NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), |
31368 | DAG.getConstant(MulAmt1, DL, VT)); |
31369 | |
31370 | if (isPowerOf2_64(MulAmt2)) |
31371 | NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, |
31372 | DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); |
31373 | else |
31374 | NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, |
31375 | DAG.getConstant(MulAmt2, DL, VT)); |
31376 | } else if (!Subtarget.slowLEA()) |
31377 | NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); |
31378 | |
31379 | if (!NewMul) { |
31380 |     assert(MulAmt != 0 &&
31381 |            MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31382 |            "Both cases that could cause potential overflows should have "
31383 |            "already been handled.");
31384 | int64_t SignMulAmt = C->getSExtValue(); |
31385 |     if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31386 |         (SignMulAmt != -INT64_MAX)) {
31387 | int NumSign = SignMulAmt > 0 ? 1 : -1; |
31388 | bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); |
31389 | bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); |
31390 | if (IsPowerOf2_64PlusOne) { |
31391 | // (mul x, 2^N + 1) => (add (shl x, N), x) |
31392 | NewMul = DAG.getNode( |
31393 | ISD::ADD, DL, VT, N->getOperand(0), |
31394 | DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), |
31395 | DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, |
31396 | MVT::i8))); |
31397 | } else if (IsPowerOf2_64MinusOne) { |
31398 | // (mul x, 2^N - 1) => (sub (shl x, N), x) |
31399 | NewMul = DAG.getNode( |
31400 | ISD::SUB, DL, VT, |
31401 | DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), |
31402 | DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, |
31403 | MVT::i8)), |
31404 | N->getOperand(0)); |
31405 | } |
31406 | // To negate, subtract the number from zero |
31407 | if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) |
31408 | NewMul = |
31409 | DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); |
31410 | } |
31411 | } |
31412 | |
31413 | if (NewMul) |
31414 | // Do not add new nodes to DAG combiner worklist. |
31415 | DCI.CombineTo(N, NewMul, false); |
31416 | |
31417 | return SDValue(); |
31418 | } |
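// For illustration, the 9/5/3 factoring performed above corresponds to this
// scalar sketch (hypothetical helper, assuming two's-complement wraparound):
//
//   uint64_t mulBy45(uint64_t X) {
//     uint64_t T = X * 9;    // selectable as LEA T, [X + 8*X]
//     return T * 5;          // selectable as LEA R, [T + 4*T]
//   }
//
// The 2^N +/- 1 fallback covers cases such as (mul x, 31), which becomes
// (sub (shl x, 5), x).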
31419 | |
31420 | static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { |
31421 | SDValue N0 = N->getOperand(0); |
31422 | SDValue N1 = N->getOperand(1); |
31423 | ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); |
31424 | EVT VT = N0.getValueType(); |
31425 | |
31426 | // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) |
31427 |   // since the result of setcc_c is all zeros or all ones.
31428 | if (VT.isInteger() && !VT.isVector() && |
31429 | N1C && N0.getOpcode() == ISD::AND && |
31430 | N0.getOperand(1).getOpcode() == ISD::Constant) { |
31431 | SDValue N00 = N0.getOperand(0); |
31432 | APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); |
31433 | Mask <<= N1C->getAPIntValue(); |
31434 | bool MaskOK = false; |
31435 |     // We can handle cases concerning bit-widening nodes containing setcc_c if
31436 |     // we carefully interrogate the mask to make sure the transform is
31437 |     // semantics-preserving.
31438 | // The transform is not safe if the result of C1 << C2 exceeds the bitwidth |
31439 | // of the underlying setcc_c operation if the setcc_c was zero extended. |
31440 | // Consider the following example: |
31441 | // zext(setcc_c) -> i32 0x0000FFFF |
31442 | // c1 -> i32 0x0000FFFF |
31443 | // c2 -> i32 0x00000001 |
31444 | // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE |
31445 | // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE |
31446 | if (N00.getOpcode() == X86ISD::SETCC_CARRY) { |
31447 | MaskOK = true; |
31448 | } else if (N00.getOpcode() == ISD::SIGN_EXTEND && |
31449 | N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { |
31450 | MaskOK = true; |
31451 | } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || |
31452 | N00.getOpcode() == ISD::ANY_EXTEND) && |
31453 | N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { |
31454 | MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); |
31455 | } |
31456 | if (MaskOK && Mask != 0) { |
31457 | SDLoc DL(N); |
31458 | return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); |
31459 | } |
31460 | } |
31461 | |
31462 |   // Hardware support for vector shifts is sparse, which makes us scalarize the
31463 |   // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
31464 |   // shl.
31465 | // (shl V, 1) -> add V,V |
31466 | if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) |
31467 | if (auto *N1SplatC = N1BV->getConstantSplatNode()) { |
31468 |       assert(N0.getValueType().isVector() && "Invalid vector shift type");
31469 | // We shift all of the values by one. In many cases we do not have |
31470 | // hardware support for this operation. This is better expressed as an ADD |
31471 | // of two values. |
31472 | if (N1SplatC->getAPIntValue() == 1) |
31473 | return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); |
31474 | } |
31475 | |
31476 | return SDValue(); |
31477 | } |
31478 | |
31479 | static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) { |
31480 | SDValue N0 = N->getOperand(0); |
31481 | SDValue N1 = N->getOperand(1); |
31482 | EVT VT = N0.getValueType(); |
31483 | unsigned Size = VT.getSizeInBits(); |
31484 | |
31485 |   // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
31486 |   // into (shl (sext a), [56,48,32,24,16] - SarConst) or
31487 |   // into (lshr (sext a), SarConst - [56,48,32,24,16])
31488 |   // depending on the sign of (SarConst - [56,48,32,24,16])
31489 | |
31490 |   // sexts on X86 are MOVs. The MOVs have the same code size as the
31491 |   // above SHIFTs (only a SHIFT by 1 has smaller code size).
31492 |   // However, the MOVs have two advantages over a SHIFT:
31493 |   // 1. MOVs can write to a register that differs from the source.
31494 |   // 2. MOVs accept memory operands.
31495 | |
31496 | if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || |
31497 | N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || |
31498 | N0.getOperand(1).getOpcode() != ISD::Constant) |
31499 | return SDValue(); |
31500 | |
31501 | SDValue N00 = N0.getOperand(0); |
31502 | SDValue N01 = N0.getOperand(1); |
31503 | APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); |
31504 | APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); |
31505 | EVT CVT = N1.getValueType(); |
31506 | |
31507 | if (SarConst.isNegative()) |
31508 | return SDValue(); |
31509 | |
31510 | for (MVT SVT : MVT::integer_valuetypes()) { |
31511 | unsigned ShiftSize = SVT.getSizeInBits(); |
31512 |     // Skip types without a corresponding sext/zext and ShlConst values
31513 |     // that are not one of [56,48,32,24,16].
31514 | if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) |
31515 | continue; |
31516 | SDLoc DL(N); |
31517 | SDValue NN = |
31518 | DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); |
31519 | SarConst = SarConst - (Size - ShiftSize); |
31520 | if (SarConst == 0) |
31521 | return NN; |
31522 | else if (SarConst.isNegative()) |
31523 | return DAG.getNode(ISD::SHL, DL, VT, NN, |
31524 | DAG.getConstant(-SarConst, DL, CVT)); |
31525 | else |
31526 | return DAG.getNode(ISD::SRA, DL, VT, NN, |
31527 | DAG.getConstant(SarConst, DL, CVT)); |
31528 | } |
31529 | return SDValue(); |
31530 | } |
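// A worked instance of the fold above (sketch): for i64 values,
//   (ashr (shl a, 56), 59)
// keeps only the low byte of 'a', sign-extended and shifted right by three
// more bits, so it is rewritten as
//   (sra (sign_extend_inreg a, i8), 3)
// which selects to a MOVSX of the low byte followed by a short SRA.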
31531 | |
31532 | /// \brief Returns a vector of 0s if the input node is a vector logical
31533 | /// shift by a constant amount which is known to be bigger than or equal |
31534 | /// to the vector element size in bits. |
31535 | static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, |
31536 | const X86Subtarget &Subtarget) { |
31537 | EVT VT = N->getValueType(0); |
31538 | |
31539 | if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && |
31540 | (!Subtarget.hasInt256() || |
31541 | (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) |
31542 | return SDValue(); |
31543 | |
31544 | SDValue Amt = N->getOperand(1); |
31545 | SDLoc DL(N); |
31546 | if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) |
31547 | if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { |
31548 | const APInt &ShiftAmt = AmtSplat->getAPIntValue(); |
31549 | unsigned MaxAmount = |
31550 | VT.getSimpleVT().getScalarSizeInBits(); |
31551 | |
31552 | // SSE2/AVX2 logical shifts always return a vector of 0s |
31553 | // if the shift amount is bigger than or equal to |
31554 | // the element size. The constant shift amount will be |
31555 |       // encoded as an 8-bit immediate.
31556 | if (ShiftAmt.trunc(8).uge(MaxAmount)) |
31557 | return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); |
31558 | } |
31559 | |
31560 | return SDValue(); |
31561 | } |
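// Example (sketch): a splat-constant logical shift such as
//   (srl v4i32 %x, <32, 32, 32, 32>)
// folds directly to the v4i32 zero vector, because PSRLD with a count >= 32
// clears every lane, so no shift instruction needs to be emitted at all.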
31562 | |
31563 | static SDValue combineShift(SDNode* N, SelectionDAG &DAG, |
31564 | TargetLowering::DAGCombinerInfo &DCI, |
31565 | const X86Subtarget &Subtarget) { |
31566 | if (N->getOpcode() == ISD::SHL) |
31567 | if (SDValue V = combineShiftLeft(N, DAG)) |
31568 | return V; |
31569 | |
31570 | if (N->getOpcode() == ISD::SRA) |
31571 | if (SDValue V = combineShiftRightAlgebraic(N, DAG)) |
31572 | return V; |
31573 | |
31574 | // Try to fold this logical shift into a zero vector. |
31575 | if (N->getOpcode() != ISD::SRA) |
31576 | if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) |
31577 | return V; |
31578 | |
31579 | return SDValue(); |
31580 | } |
31581 | |
31582 | static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, |
31583 | TargetLowering::DAGCombinerInfo &DCI, |
31584 | const X86Subtarget &Subtarget) { |
31585 | unsigned Opcode = N->getOpcode(); |
31586 |   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31587 |           X86ISD::VSRLI == Opcode) &&
31588 |          "Unexpected shift opcode");
31589 | bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; |
31590 | EVT VT = N->getValueType(0); |
31591 | SDValue N0 = N->getOperand(0); |
31592 | SDValue N1 = N->getOperand(1); |
31593 | unsigned NumBitsPerElt = VT.getScalarSizeInBits(); |
31594 |   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31595 |          "Unexpected value type");
31596 | |
31597 | // Out of range logical bit shifts are guaranteed to be zero. |
31598 | // Out of range arithmetic bit shifts splat the sign bit. |
31599 | APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue(); |
31600 | if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { |
31601 | if (LogicalShift) |
31602 | return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); |
31603 | else |
31604 | ShiftVal = NumBitsPerElt - 1; |
31605 | } |
31606 | |
31607 | // Shift N0 by zero -> N0. |
31608 | if (!ShiftVal) |
31609 | return N0; |
31610 | |
31611 | // Shift zero -> zero. |
31612 | if (ISD::isBuildVectorAllZeros(N0.getNode())) |
31613 | return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); |
31614 | |
31615 | // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). |
31616 | // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. |
31617 | // TODO - support other sra opcodes as needed. |
31618 | if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && |
31619 | N0.getOpcode() == X86ISD::VSRAI) |
31620 | return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); |
31621 | |
31622 | // We can decode 'whole byte' logical bit shifts as shuffles. |
31623 | if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { |
31624 | SDValue Op(N, 0); |
31625 | SmallVector<int, 1> NonceMask; // Just a placeholder. |
31626 | NonceMask.push_back(0); |
31627 | if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, |
31628 | /*Depth*/ 1, /*HasVarMask*/ false, DAG, |
31629 | DCI, Subtarget)) |
31630 | return SDValue(); // This routine will use CombineTo to replace N. |
31631 | } |
31632 | |
31633 | // Constant Folding. |
31634 | APInt UndefElts; |
31635 | SmallVector<APInt, 32> EltBits; |
31636 | if (N->isOnlyUserOf(N0.getNode()) && |
31637 | getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { |
31638 |     assert(EltBits.size() == VT.getVectorNumElements() &&
31639 |            "Unexpected shift value type");
31640 | unsigned ShiftImm = ShiftVal.getZExtValue(); |
31641 | for (APInt &Elt : EltBits) { |
31642 | if (X86ISD::VSHLI == Opcode) |
31643 | Elt <<= ShiftImm; |
31644 | else if (X86ISD::VSRAI == Opcode) |
31645 | Elt.ashrInPlace(ShiftImm); |
31646 | else |
31647 | Elt.lshrInPlace(ShiftImm); |
31648 | } |
31649 | return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); |
31650 | } |
31651 | |
31652 | return SDValue(); |
31653 | } |
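// Constant-folding example for the path above (sketch):
//   (X86ISD::VSRAI <i32 -16, i32 64>, 4)
// becomes the constant build vector <i32 -1, i32 4>, since each lane is
// arithmetically shifted in place at compile time.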
31654 | |
31655 | static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, |
31656 | TargetLowering::DAGCombinerInfo &DCI, |
31657 | const X86Subtarget &Subtarget) { |
31658 |   assert(((N->getOpcode() == X86ISD::PINSRB &&
31659 |            N->getValueType(0) == MVT::v16i8) ||
31660 |           (N->getOpcode() == X86ISD::PINSRW &&
31661 |            N->getValueType(0) == MVT::v8i16)) &&
31662 |          "Unexpected vector insertion");
31663 | |
31664 | // Attempt to combine PINSRB/PINSRW patterns to a shuffle. |
31665 | SDValue Op(N, 0); |
31666 | SmallVector<int, 1> NonceMask; // Just a placeholder. |
31667 | NonceMask.push_back(0); |
31668 | combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, |
31669 | /*Depth*/ 1, /*HasVarMask*/ false, DAG, |
31670 | DCI, Subtarget); |
31671 | return SDValue(); |
31672 | } |
31673 | |
31674 | /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs |
31675 | /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for |
31676 | /// OR -> CMPNEQSS. |
31677 | static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, |
31678 | TargetLowering::DAGCombinerInfo &DCI, |
31679 | const X86Subtarget &Subtarget) { |
31680 | unsigned opcode; |
31681 | |
31682 | // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but |
31683 | // we're requiring SSE2 for both. |
31684 | if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { |
31685 | SDValue N0 = N->getOperand(0); |
31686 | SDValue N1 = N->getOperand(1); |
31687 | SDValue CMP0 = N0->getOperand(1); |
31688 | SDValue CMP1 = N1->getOperand(1); |
31689 | SDLoc DL(N); |
31690 | |
31691 | // The SETCCs should both refer to the same CMP. |
31692 | if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) |
31693 | return SDValue(); |
31694 | |
31695 | SDValue CMP00 = CMP0->getOperand(0); |
31696 | SDValue CMP01 = CMP0->getOperand(1); |
31697 | EVT VT = CMP00.getValueType(); |
31698 | |
31699 | if (VT == MVT::f32 || VT == MVT::f64) { |
31700 | bool ExpectingFlags = false; |
31701 | // Check for any users that want flags: |
31702 | for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); |
31703 | !ExpectingFlags && UI != UE; ++UI) |
31704 | switch (UI->getOpcode()) { |
31705 | default: |
31706 | case ISD::BR_CC: |
31707 | case ISD::BRCOND: |
31708 | case ISD::SELECT: |
31709 | ExpectingFlags = true; |
31710 | break; |
31711 | case ISD::CopyToReg: |
31712 | case ISD::SIGN_EXTEND: |
31713 | case ISD::ZERO_EXTEND: |
31714 | case ISD::ANY_EXTEND: |
31715 | break; |
31716 | } |
31717 | |
31718 | if (!ExpectingFlags) { |
31719 | enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); |
31720 | enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); |
31721 | |
31722 | if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { |
31723 | X86::CondCode tmp = cc0; |
31724 | cc0 = cc1; |
31725 | cc1 = tmp; |
31726 | } |
31727 | |
31728 | if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || |
31729 | (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { |
31730 | // FIXME: need symbolic constants for these magic numbers. |
31731 | // See X86ATTInstPrinter.cpp:printSSECC(). |
31732 | unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; |
31733 | if (Subtarget.hasAVX512()) { |
31734 | SDValue FSetCC = |
31735 | DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, |
31736 | DAG.getConstant(x86cc, DL, MVT::i8)); |
31737 | return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), |
31738 | FSetCC, DAG.getIntPtrConstant(0, DL)); |
31739 | } |
31740 | SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, |
31741 | CMP00.getValueType(), CMP00, CMP01, |
31742 | DAG.getConstant(x86cc, DL, |
31743 | MVT::i8)); |
31744 | |
31745 | bool is64BitFP = (CMP00.getValueType() == MVT::f64); |
31746 | MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; |
31747 | |
31748 | if (is64BitFP && !Subtarget.is64Bit()) { |
31749 | // On a 32-bit target, we cannot bitcast the 64-bit float to a |
31750 | // 64-bit integer, since that's not a legal type. Since |
31751 |             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31752 | // bits, but can do this little dance to extract the lowest 32 bits |
31753 | // and work with those going forward. |
31754 | SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, |
31755 | OnesOrZeroesF); |
31756 | SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); |
31757 | OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, |
31758 | Vector32, DAG.getIntPtrConstant(0, DL)); |
31759 | IntVT = MVT::i32; |
31760 | } |
31761 | |
31762 | SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); |
31763 | SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, |
31764 | DAG.getConstant(1, DL, IntVT)); |
31765 | SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, |
31766 | ANDed); |
31767 | return OneBitOfTruth; |
31768 | } |
31769 | } |
31770 | } |
31771 | } |
31772 | return SDValue(); |
31773 | } |
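// The net effect in pseudo-C++ (an illustrative sketch, not lowering code;
// 'bit_cast' stands in for a plain bitwise reinterpretation):
//
//   uint32_t Mask = bit_cast<uint32_t>(cmpeqss(a, b)); // all-ones/all-zeros
//   bool Result = (Mask & 1) != 0;                     // CMPEQSS + AND + TRUNC
//
// versus the UCOMISS path, which needs SETE plus SETNP glue to handle the
// unordered case.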
31774 | |
31775 | /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). |
31776 | static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { |
31777 |   assert(N->getOpcode() == ISD::AND);
31778 | |
31779 | EVT VT = N->getValueType(0); |
31780 | SDValue N0 = N->getOperand(0); |
31781 | SDValue N1 = N->getOperand(1); |
31782 | SDLoc DL(N); |
31783 | |
31784 | if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) |
31785 | return SDValue(); |
31786 | |
31787 | if (N0.getOpcode() == ISD::XOR && |
31788 | ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) |
31789 | return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); |
31790 | |
31791 | if (N1.getOpcode() == ISD::XOR && |
31792 | ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) |
31793 | return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); |
31794 | |
31795 | return SDValue(); |
31796 | } |
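// Example (sketch): (and (xor %x, <all-ones>), %y) becomes
// (X86ISD::ANDNP %x, %y), matching PANDN, which computes (~Src1) & Src2 in a
// single instruction instead of a separate PXOR and PAND.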
31797 | |
31798 | // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized |
31799 | // register. In most cases we actually compare or select YMM-sized registers |
31800 | // and mixing the two types creates horrible code. This method optimizes |
31801 | // some of the transition sequences. |
31802 | static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, |
31803 | TargetLowering::DAGCombinerInfo &DCI, |
31804 | const X86Subtarget &Subtarget) { |
31805 | EVT VT = N->getValueType(0); |
31806 | if (!VT.is256BitVector()) |
31807 | return SDValue(); |
31808 | |
31809 |   assert((N->getOpcode() == ISD::ANY_EXTEND ||
31810 |           N->getOpcode() == ISD::ZERO_EXTEND ||
31811 |           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31812 | |
31813 | SDValue Narrow = N->getOperand(0); |
31814 | EVT NarrowVT = Narrow->getValueType(0); |
31815 | if (!NarrowVT.is128BitVector()) |
31816 | return SDValue(); |
31817 | |
31818 | if (Narrow->getOpcode() != ISD::XOR && |
31819 | Narrow->getOpcode() != ISD::AND && |
31820 | Narrow->getOpcode() != ISD::OR) |
31821 | return SDValue(); |
31822 | |
31823 | SDValue N0 = Narrow->getOperand(0); |
31824 | SDValue N1 = Narrow->getOperand(1); |
31825 | SDLoc DL(Narrow); |
31826 | |
31827 |   // The left side has to be a trunc.
31828 | if (N0.getOpcode() != ISD::TRUNCATE) |
31829 | return SDValue(); |
31830 | |
31831 | // The type of the truncated inputs. |
31832 | EVT WideVT = N0->getOperand(0)->getValueType(0); |
31833 | if (WideVT != VT) |
31834 | return SDValue(); |
31835 | |
31836 | // The right side has to be a 'trunc' or a constant vector. |
31837 | bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; |
31838 | ConstantSDNode *RHSConstSplat = nullptr; |
31839 | if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) |
31840 | RHSConstSplat = RHSBV->getConstantSplatNode(); |
31841 | if (!RHSTrunc && !RHSConstSplat) |
31842 | return SDValue(); |
31843 | |
31844 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
31845 | |
31846 | if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) |
31847 | return SDValue(); |
31848 | |
31849 | // Set N0 and N1 to hold the inputs to the new wide operation. |
31850 | N0 = N0->getOperand(0); |
31851 | if (RHSConstSplat) { |
31852 | N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), |
31853 | SDValue(RHSConstSplat, 0)); |
31854 | N1 = DAG.getSplatBuildVector(WideVT, DL, N1); |
31855 | } else if (RHSTrunc) { |
31856 | N1 = N1->getOperand(0); |
31857 | } |
31858 | |
31859 | // Generate the wide operation. |
31860 | SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); |
31861 | unsigned Opcode = N->getOpcode(); |
31862 | switch (Opcode) { |
31863 | case ISD::ANY_EXTEND: |
31864 | return Op; |
31865 | case ISD::ZERO_EXTEND: { |
31866 | unsigned InBits = NarrowVT.getScalarSizeInBits(); |
31867 | APInt Mask = APInt::getAllOnesValue(InBits); |
31868 | Mask = Mask.zext(VT.getScalarSizeInBits()); |
31869 | return DAG.getNode(ISD::AND, DL, VT, |
31870 | Op, DAG.getConstant(Mask, DL, VT)); |
31871 | } |
31872 | case ISD::SIGN_EXTEND: |
31873 | return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, |
31874 | Op, DAG.getValueType(NarrowVT)); |
31875 | default: |
31876 |     llvm_unreachable("Unexpected opcode");
31877 | } |
31878 | } |
31879 | |
31880 | /// If both input operands of a logic op are being cast from floating point |
31881 | /// types, try to convert this into a floating point logic node to avoid |
31882 | /// unnecessary moves from SSE to integer registers. |
31883 | static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, |
31884 | const X86Subtarget &Subtarget) { |
31885 | unsigned FPOpcode = ISD::DELETED_NODE; |
31886 | if (N->getOpcode() == ISD::AND) |
31887 | FPOpcode = X86ISD::FAND; |
31888 | else if (N->getOpcode() == ISD::OR) |
31889 | FPOpcode = X86ISD::FOR; |
31890 | else if (N->getOpcode() == ISD::XOR) |
31891 | FPOpcode = X86ISD::FXOR; |
31892 | |
31893 |   assert(FPOpcode != ISD::DELETED_NODE &&
31894 |          "Unexpected input node for FP logic conversion");
31895 | |
31896 | EVT VT = N->getValueType(0); |
31897 | SDValue N0 = N->getOperand(0); |
31898 | SDValue N1 = N->getOperand(1); |
31899 | SDLoc DL(N); |
31900 | if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && |
31901 | ((Subtarget.hasSSE1() && VT == MVT::i32) || |
31902 | (Subtarget.hasSSE2() && VT == MVT::i64))) { |
31903 | SDValue N00 = N0.getOperand(0); |
31904 | SDValue N10 = N1.getOperand(0); |
31905 | EVT N00Type = N00.getValueType(); |
31906 | EVT N10Type = N10.getValueType(); |
31907 | if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { |
31908 | SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); |
31909 | return DAG.getBitcast(VT, FPLogic); |
31910 | } |
31911 | } |
31912 | return SDValue(); |
31913 | } |
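// Example (sketch): (i32 and (bitcast f32 %a), (bitcast f32 %b)) is rewritten
// as (bitcast (X86ISD::FAND %a, %b)), i.e. a single ANDPS, so the values stay
// in SSE registers instead of round-tripping through GPRs.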
31914 | |
31915 | /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
31916 | /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31917 | /// with a shift-right to eliminate loading the vector constant mask value.
31918 | static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, |
31919 | const X86Subtarget &Subtarget) { |
31920 | SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); |
31921 | SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); |
31922 | EVT VT0 = Op0.getValueType(); |
31923 | EVT VT1 = Op1.getValueType(); |
31924 | |
31925 | if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) |
31926 | return SDValue(); |
31927 | |
31928 | APInt SplatVal; |
31929 | if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || |
31930 | !SplatVal.isMask()) |
31931 | return SDValue(); |
31932 | |
31933 | if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) |
31934 | return SDValue(); |
31935 | |
31936 | unsigned EltBitWidth = VT0.getScalarSizeInBits(); |
31937 | if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) |
31938 | return SDValue(); |
31939 | |
31940 | SDLoc DL(N); |
31941 | unsigned ShiftVal = SplatVal.countTrailingOnes(); |
31942 | SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); |
31943 | SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); |
31944 | return DAG.getBitcast(N->getValueType(0), Shift); |
31945 | } |
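// Example (sketch): if every element of %x is known to be all-ones or
// all-zeros (e.g. a v4i32 PCMPGT result), then
//   (and %x, <1, 1, 1, 1>)
// becomes
//   (X86ISD::VSRLI %x, 31)
// avoiding the constant-pool load of the <1, 1, 1, 1> mask.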
31946 | |
31947 | static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, |
31948 | TargetLowering::DAGCombinerInfo &DCI, |
31949 | const X86Subtarget &Subtarget) { |
31950 | if (DCI.isBeforeLegalizeOps()) |
31951 | return SDValue(); |
31952 | |
31953 | if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) |
31954 | return R; |
31955 | |
31956 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) |
31957 | return FPLogic; |
31958 | |
31959 | if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) |
31960 | return R; |
31961 | |
31962 | if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) |
31963 | return ShiftRight; |
31964 | |
31965 | EVT VT = N->getValueType(0); |
31966 | SDValue N0 = N->getOperand(0); |
31967 | SDValue N1 = N->getOperand(1); |
31968 | SDLoc DL(N); |
31969 | |
31970 | // Attempt to recursively combine a bitmask AND with shuffles. |
31971 | if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { |
31972 | SDValue Op(N, 0); |
31973 | SmallVector<int, 1> NonceMask; // Just a placeholder. |
31974 | NonceMask.push_back(0); |
31975 | if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, |
31976 | /*Depth*/ 1, /*HasVarMask*/ false, DAG, |
31977 | DCI, Subtarget)) |
31978 | return SDValue(); // This routine will use CombineTo to replace N. |
31979 | } |
31980 | |
31981 | // Create BEXTR instructions |
31982 | // BEXTR is ((X >> imm) & (2**size-1)) |
31983 | if (VT != MVT::i32 && VT != MVT::i64) |
31984 | return SDValue(); |
31985 | |
31986 | if (!Subtarget.hasBMI() && !Subtarget.hasTBM()) |
31987 | return SDValue(); |
31988 | if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL) |
31989 | return SDValue(); |
31990 | |
31991 | ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); |
31992 | ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); |
31993 | if (MaskNode && ShiftNode) { |
31994 | uint64_t Mask = MaskNode->getZExtValue(); |
31995 | uint64_t Shift = ShiftNode->getZExtValue(); |
31996 | if (isMask_64(Mask)) { |
31997 | uint64_t MaskSize = countPopulation(Mask); |
31998 | if (Shift + MaskSize <= VT.getSizeInBits()) |
31999 | return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), |
32000 | DAG.getConstant(Shift | (MaskSize << 8), DL, |
32001 | VT)); |
32002 | } |
32003 | } |
32004 | return SDValue(); |
32005 | } |
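// BEXTR example for the combine above (sketch): with BMI available,
//   (and (srl %x, 4), 0xFFF)
// extracts 12 bits starting at bit 4, so it becomes
//   (X86ISD::BEXTR %x, 0xC04)   // control = start 4 | (length 12 << 8)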
32006 | |
32007 | // Try to fold:
32008 | //   (or (and m, y), (pandn m, x))
32009 | // into:
32010 | //   (vselect m, x, y)
32011 | // As a special case, try to fold:
32012 | //   (or (and m, (sub 0, x)), (pandn m, x))
32013 | // into:
32014 | //   (sub (xor x, m), m)
32015 | static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, |
32016 | const X86Subtarget &Subtarget) { |
32017 |   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32018 | |
32019 | SDValue N0 = N->getOperand(0); |
32020 | SDValue N1 = N->getOperand(1); |
32021 | EVT VT = N->getValueType(0); |
32022 | |
32023 | if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || |
32024 | (VT.is256BitVector() && Subtarget.hasInt256()))) |
32025 | return SDValue(); |
32026 | |
32027 | // Canonicalize AND to LHS. |
32028 | if (N1.getOpcode() == ISD::AND) |
32029 | std::swap(N0, N1); |
32030 | |
32031 | // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for |
32032 | // ANDNP combine allows other combines to happen that prevent matching. |
32033 | if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) |
32034 | return SDValue(); |
32035 | |
32036 | SDValue Mask = N1.getOperand(0); |
32037 | SDValue X = N1.getOperand(1); |
32038 | SDValue Y; |
32039 | if (N0.getOperand(0) == Mask) |
32040 | Y = N0.getOperand(1); |
32041 | if (N0.getOperand(1) == Mask) |
32042 | Y = N0.getOperand(0); |
32043 | |
32044 | // Check to see if the mask appeared in both the AND and ANDNP. |
32045 | if (!Y.getNode()) |
32046 | return SDValue(); |
32047 | |
32048 | // Validate that X, Y, and Mask are bitcasts, and see through them. |
32049 | Mask = peekThroughBitcasts(Mask); |
32050 | X = peekThroughBitcasts(X); |
32051 | Y = peekThroughBitcasts(Y); |
32052 | |
32053 | EVT MaskVT = Mask.getValueType(); |
32054 | unsigned EltBits = MaskVT.getScalarSizeInBits(); |
32055 | |
32056 | // TODO: Attempt to handle floating point cases as well? |
32057 | if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) |
32058 | return SDValue(); |
32059 | |
32060 | SDLoc DL(N); |
32061 | |
32062 | // Try to match: |
32063 |   // (or (and M, (sub 0, X)), (pandn M, X))
32064 | // which is a special case of vselect: |
32065 | // (vselect M, (sub 0, X), X) |
32066 | // Per: |
32067 | // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate |
32068 | // We know that, if fNegate is 0 or 1: |
32069 | // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) |
32070 | // |
32071 | // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: |
32072 | // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) |
32073 | // ( M ? -X : X) == ((X ^ M ) + (M & 1)) |
32074 | // This lets us transform our vselect to: |
32075 | // (add (xor X, M), (and M, 1)) |
32076 | // And further to: |
32077 | // (sub (xor X, M), M) |
32078 | if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && |
32079 | DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { |
32080 | auto IsNegV = [](SDNode *N, SDValue V) { |
32081 | return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && |
32082 | ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); |
32083 | }; |
32084 | SDValue V; |
32085 | if (IsNegV(Y.getNode(), X)) |
32086 | V = X; |
32087 | else if (IsNegV(X.getNode(), Y)) |
32088 | V = Y; |
32089 | |
32090 | if (V) { |
32091 | SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); |
32092 | SDValue SubOp2 = Mask; |
32093 | |
32094 | // If the negate was on the false side of the select, then |
32095 | // the operands of the SUB need to be swapped. PR 27251. |
32096 | // This is because the pattern being matched above is |
32097 | // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) |
32098 | // but if the pattern matched was |
32099 | // (vselect M, X, (sub (0, X))), that is really negation of the pattern |
32100 | // above, -(vselect M, (sub 0, X), X), and therefore the replacement |
32101 | // pattern also needs to be a negation of the replacement pattern above. |
32102 | // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the |
32103 | // sub accomplishes the negation of the replacement pattern. |
32104 | if (V == Y) |
32105 | std::swap(SubOp1, SubOp2); |
32106 | |
32107 | SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); |
32108 | return DAG.getBitcast(VT, Res); |
32109 | } |
32110 | } |
32111 | |
32112 | // PBLENDVB is only available on SSE 4.1. |
32113 | if (!Subtarget.hasSSE41()) |
32114 | return SDValue(); |
32115 | |
32116 | MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; |
32117 | |
32118 | X = DAG.getBitcast(BlendVT, X); |
32119 | Y = DAG.getBitcast(BlendVT, Y); |
32120 | Mask = DAG.getBitcast(BlendVT, Mask); |
32121 | Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); |
32122 | return DAG.getBitcast(VT, Mask); |
32123 | } |
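// Scalar model of the conditional-negate identity used above (a sketch
// assuming two's complement; M is all-ones or all-zeros per element, and the
// helper name is hypothetical):
//
//   int32_t condNegate(int32_t X, int32_t M) {
//     return (X ^ M) - M;   // M == -1 ? -X : X
//   }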
32124 | |
32125 | // Helper function for combineOrCmpEqZeroToCtlzSrl |
32126 | // Transforms: |
32127 | // seteq(cmp x, 0) |
32128 | // into: |
32129 | // srl(ctlz(x), log2(bitsize(x)))
32130 | // Input pattern is checked by caller. |
32131 | static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, |
32132 | SelectionDAG &DAG) { |
32133 | SDValue Cmp = Op.getOperand(1); |
32134 | EVT VT = Cmp.getOperand(0).getValueType(); |
32135 | unsigned Log2b = Log2_32(VT.getSizeInBits()); |
32136 | SDLoc dl(Op); |
32137 | SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); |
32138 | // The result of the shift is true or false, and on X86, the 32-bit |
32139 | // encoding of shr and lzcnt is more desirable. |
32140 | SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); |
32141 | SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, |
32142 |                             DAG.getConstant(Log2b, dl, MVT::i8));
32143 | return DAG.getZExtOrTrunc(Scc, dl, ExtTy); |
32144 | } |
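// Example (sketch): for i32 %x, (seteq (cmp %x, 0)) becomes
//   (srl (ctlz %x), 5)
// LZCNT returns 32 exactly when %x == 0, and 32 >> log2(32) == 1, while any
// nonzero input yields a count <= 31 and hence a shifted result of 0.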
32145 | |
32146 | // Try to transform: |
32147 | // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) |
32148 | // into: |
32149 | // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
32150 | // Will also attempt to match more generic cases, eg: |
32151 | // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) |
32152 | // Only applies if the target supports the FastLZCNT feature. |
32153 | static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, |
32154 | TargetLowering::DAGCombinerInfo &DCI, |
32155 | const X86Subtarget &Subtarget) { |
32156 | if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) |
32157 | return SDValue(); |
32158 | |
32159 | auto isORCandidate = [](SDValue N) { |
32160 | return (N->getOpcode() == ISD::OR && N->hasOneUse()); |
32161 | }; |
32162 | |
32163 |   // Check that the zero extend is extending to 32 bits or more. The code
32164 |   // generated by srl(ctlz) for 16-bit-or-smaller variants of the pattern would
32165 |   // require extra instructions to clear the upper bits.
32166 | if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || |
32167 | !isORCandidate(N->getOperand(0))) |
32168 | return SDValue(); |
32169 | |
32170 | // Check the node matches: setcc(eq, cmp 0) |
32171 | auto isSetCCCandidate = [](SDValue N) { |
32172 | return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && |
32173 | X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && |
32174 | N->getOperand(1).getOpcode() == X86ISD::CMP && |
32175 | isNullConstant(N->getOperand(1).getOperand(1)) && |
32176 | N->getOperand(1).getValueType().bitsGE(MVT::i32); |
32177 | }; |
32178 | |
32179 | SDNode *OR = N->getOperand(0).getNode(); |
32180 | SDValue LHS = OR->getOperand(0); |
32181 | SDValue RHS = OR->getOperand(1); |
32182 | |
32183 | // Save nodes matching or(or, setcc(eq, cmp 0)). |
32184 | SmallVector<SDNode *, 2> ORNodes; |
32185 | while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || |
32186 | (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { |
32187 | ORNodes.push_back(OR); |
32188 | OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); |
32189 | LHS = OR->getOperand(0); |
32190 | RHS = OR->getOperand(1); |
32191 | } |
32192 | |
32193 | // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). |
32194 | if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || |
32195 | !isORCandidate(SDValue(OR, 0))) |
32196 | return SDValue(); |
32197 | |
32198 |   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower
32199 |   // it to
32200 |   // or(srl(ctlz), srl(ctlz)).
32201 | // The dag combiner can then fold it into: |
32202 | // srl(or(ctlz, ctlz)). |
32203 | EVT VT = OR->getValueType(0); |
32204 | SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); |
32205 | SDValue Ret, NewRHS; |
32206 | if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) |
32207 | Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); |
32208 | |
32209 | if (!Ret) |
32210 | return SDValue(); |
32211 | |
32212 | // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. |
32213 | while (ORNodes.size() > 0) { |
32214 | OR = ORNodes.pop_back_val(); |
32215 | LHS = OR->getOperand(0); |
32216 | RHS = OR->getOperand(1); |
32217 | // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). |
32218 | if (RHS->getOpcode() == ISD::OR) |
32219 | std::swap(LHS, RHS); |
32220 | EVT VT = OR->getValueType(0); |
32221 | SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); |
32222 | if (!NewRHS) |
32223 | return SDValue(); |
32224 | Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); |
32225 | } |
32226 | |
32227 | if (Ret) |
32228 | Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); |
32229 | |
32230 | return Ret; |
32231 | } |
32232 | |
32233 | static SDValue combineOr(SDNode *N, SelectionDAG &DAG, |
32234 | TargetLowering::DAGCombinerInfo &DCI, |
32235 | const X86Subtarget &Subtarget) { |
32236 | if (DCI.isBeforeLegalizeOps()) |
32237 | return SDValue(); |
32238 | |
32239 | if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) |
32240 | return R; |
32241 | |
32242 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) |
32243 | return FPLogic; |
32244 | |
32245 | if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) |
32246 | return R; |
32247 | |
32248 | SDValue N0 = N->getOperand(0); |
32249 | SDValue N1 = N->getOperand(1); |
32250 | EVT VT = N->getValueType(0); |
32251 | |
32252 | if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) |
32253 | return SDValue(); |
32254 | |
32255 | // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) |
32256 | bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); |
32257 | |
32258 | // SHLD/SHRD instructions have lower register pressure, but on some |
32259 | // platforms they have higher latency than the equivalent |
32260 | // series of shifts/or that would otherwise be generated. |
32261 | // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions |
32262 | // have higher latencies and we are not optimizing for size. |
32263 | if (!OptForSize && Subtarget.isSHLDSlow()) |
32264 | return SDValue(); |
32265 | |
32266 | if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) |
32267 | std::swap(N0, N1); |
32268 | if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) |
32269 | return SDValue(); |
32270 | if (!N0.hasOneUse() || !N1.hasOneUse()) |
32271 | return SDValue(); |
32272 | |
32273 | SDValue ShAmt0 = N0.getOperand(1); |
32274 | if (ShAmt0.getValueType() != MVT::i8) |
32275 | return SDValue(); |
32276 | SDValue ShAmt1 = N1.getOperand(1); |
32277 | if (ShAmt1.getValueType() != MVT::i8) |
32278 | return SDValue(); |
32279 | if (ShAmt0.getOpcode() == ISD::TRUNCATE) |
32280 | ShAmt0 = ShAmt0.getOperand(0); |
32281 | if (ShAmt1.getOpcode() == ISD::TRUNCATE) |
32282 | ShAmt1 = ShAmt1.getOperand(0); |
32283 | |
32284 | SDLoc DL(N); |
32285 | unsigned Opc = X86ISD::SHLD; |
32286 | SDValue Op0 = N0.getOperand(0); |
32287 | SDValue Op1 = N1.getOperand(0); |
32288 | if (ShAmt0.getOpcode() == ISD::SUB || |
32289 | ShAmt0.getOpcode() == ISD::XOR) { |
32290 | Opc = X86ISD::SHRD; |
32291 | std::swap(Op0, Op1); |
32292 | std::swap(ShAmt0, ShAmt1); |
32293 | } |
32294 | |
32295 | // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) |
32296 | // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) |
32297 | // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) |
32298 | // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) |
32299 | unsigned Bits = VT.getSizeInBits(); |
32300 | if (ShAmt1.getOpcode() == ISD::SUB) { |
32301 | SDValue Sum = ShAmt1.getOperand(0); |
32302 | if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { |
32303 | SDValue ShAmt1Op1 = ShAmt1.getOperand(1); |
32304 | if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) |
32305 | ShAmt1Op1 = ShAmt1Op1.getOperand(0); |
32306 | if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) |
32307 | return DAG.getNode(Opc, DL, VT, |
32308 | Op0, Op1, |
32309 | DAG.getNode(ISD::TRUNCATE, DL, |
32310 | MVT::i8, ShAmt0)); |
32311 | } |
32312 | } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { |
32313 | ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); |
32314 | if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) |
32315 | return DAG.getNode(Opc, DL, VT, |
32316 | N0.getOperand(0), N1.getOperand(0), |
32317 | DAG.getNode(ISD::TRUNCATE, DL, |
32318 | MVT::i8, ShAmt0)); |
32319 | } else if (ShAmt1.getOpcode() == ISD::XOR) { |
32320 | SDValue Mask = ShAmt1.getOperand(1); |
32321 | if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) { |
32322 | unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); |
32323 | SDValue ShAmt1Op0 = ShAmt1.getOperand(0); |
32324 | if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) |
32325 | ShAmt1Op0 = ShAmt1Op0.getOperand(0); |
32326 | if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) { |
32327 | if (Op1.getOpcode() == InnerShift && |
32328 | isa<ConstantSDNode>(Op1.getOperand(1)) && |
32329 | Op1.getConstantOperandVal(1) == 1) { |
32330 | return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), |
32331 | DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); |
32332 | } |
32333 | // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). |
32334 | if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && |
32335 | Op1.getOperand(0) == Op1.getOperand(1)) { |
32336 | return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), |
32337 | DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); |
32338 | } |
32339 | } |
32340 | } |
32341 | } |
32342 | |
32343 | return SDValue(); |
32344 | } |
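// Example (sketch): for i32 values,
//   (or (shl %x, %c), (srl %y, (sub 32, %c)))
// matches the first pattern above and becomes (X86ISD::SHLD %x, %y, %c),
// a single SHLD instruction replacing two shifts and an OR.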
32345 | |
32346 | /// Generate NEG and CMOV for integer abs. |
32347 | static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { |
32348 | EVT VT = N->getValueType(0); |
32349 | |
32350 | // Since X86 does not have CMOV for 8-bit integer, we don't convert |
32351 | // 8-bit integer abs to NEG and CMOV. |
32352 | if (VT.isInteger() && VT.getSizeInBits() == 8) |
32353 | return SDValue(); |
32354 | |
32355 | SDValue N0 = N->getOperand(0); |
32356 | SDValue N1 = N->getOperand(1); |
32357 | SDLoc DL(N); |
32358 | |
32359 | // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) |
32360 | // and change it to SUB and CMOV. |
32361 | if (VT.isInteger() && N->getOpcode() == ISD::XOR && |
32362 | N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && |
32363 | N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) { |
32364 | auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); |
32365 | if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { |
32366 | // Generate SUB & CMOV. |
32367 | SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), |
32368 | DAG.getConstant(0, DL, VT), N0.getOperand(0)); |
32369 | SDValue Ops[] = {N0.getOperand(0), Neg, |
32370 | DAG.getConstant(X86::COND_GE, DL, MVT::i8), |
32371 | SDValue(Neg.getNode(), 1)}; |
32372 | return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); |
32373 | } |
32374 | } |
32375 | return SDValue(); |
32376 | } |
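// Scalar model of the abs pattern being matched (sketch): with
// Y = X >> 31 (arithmetic shift),
//   (X + Y) ^ Y  ==  (X >= 0 ? X : -X)
// The rewrite instead emits NEG to compute 0 - X and a CMOV on the resulting
// flags to pick between X and its negation.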
32377 | |
32378 | /// Try to turn tests against the signbit in the form of: |
32379 | /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) |
32380 | /// into: |
32381 | /// SETGT(X, -1) |
32382 | static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { |
32383 | // This is only worth doing if the output type is i8 or i1. |
32384 | EVT ResultType = N->getValueType(0); |
32385 | if (ResultType != MVT::i8 && ResultType != MVT::i1) |
32386 | return SDValue(); |
32387 | |
32388 | SDValue N0 = N->getOperand(0); |
32389 | SDValue N1 = N->getOperand(1); |
32390 | |
32391 | // We should be performing an xor against a truncated shift. |
32392 | if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) |
32393 | return SDValue(); |
32394 | |
32395 | // Make sure we are performing an xor against one. |
32396 | if (!isOneConstant(N1)) |
32397 | return SDValue(); |
32398 | |
32399 | // SetCC on x86 zero extends so only act on this if it's a logical shift. |
32400 | SDValue Shift = N0.getOperand(0); |
32401 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) |
32402 | return SDValue(); |
32403 | |
32404 | // Make sure we are truncating from one of i16, i32 or i64. |
32405 | EVT ShiftTy = Shift.getValueType(); |
32406 | if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) |
32407 | return SDValue(); |
32408 | |
32409 | // Make sure the shift amount extracts the sign bit. |
32410 | if (!isa<ConstantSDNode>(Shift.getOperand(1)) || |
32411 | Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) |
32412 | return SDValue(); |
32413 | |
32414 | // Create a greater-than comparison against -1. |
32415 |   // N.B. Using SETGE against 0 works but we want a canonical-looking
32416 |   // comparison; using SETGT matches up with what TranslateX86CC does.
32417 | SDLoc DL(N); |
32418 | SDValue ShiftOp = Shift.getOperand(0); |
32419 | EVT ShiftOpTy = ShiftOp.getValueType(); |
32420 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
32421 | EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), |
32422 | *DAG.getContext(), ResultType); |
32423 | SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, |
32424 | DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); |
32425 | if (SetCCResultType != ResultType) |
32426 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); |
32427 | return Cond; |
32428 | } |
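// Example (sketch): for i64 %x,
//   (xor (trunc (srl %x, 63)), 1)    // "is the sign bit clear?"
// becomes
//   (setgt %x, -1)
// which lowers to a single CMP/TEST plus SETG instead of a shift, a truncate
// and an XOR.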
32429 | |
32430 | /// Turn vector tests of the signbit in the form of: |
32431 | /// xor (sra X, elt_size(X)-1), -1 |
32432 | /// into: |
32433 | /// pcmpgt X, -1 |
32434 | /// |
32435 | /// This should be called before type legalization because the pattern may not |
32436 | /// persist after that. |
32437 | static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, |
32438 | const X86Subtarget &Subtarget) { |
32439 | EVT VT = N->getValueType(0); |
32440 | if (!VT.isSimple()) |
32441 | return SDValue(); |
32442 | |
32443 | switch (VT.getSimpleVT().SimpleTy) { |
32444 | default: return SDValue(); |
32445 | case MVT::v16i8: |
32446 | case MVT::v8i16: |
32447 | case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; |
32448 | case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; |
32449 | case MVT::v32i8: |
32450 | case MVT::v16i16: |
32451 | case MVT::v8i32: |
32452 | case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; |
32453 | } |
32454 | |
32455 | // There must be a shift right algebraic before the xor, and the xor must be a |
32456 | // 'not' operation. |
32457 | SDValue Shift = N->getOperand(0); |
32458 | SDValue Ones = N->getOperand(1); |
32459 | if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || |
32460 | !ISD::isBuildVectorAllOnes(Ones.getNode())) |
32461 | return SDValue(); |
32462 | |
32463 | // The shift should be smearing the sign bit across each vector element. |
32464 | auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1)); |
32465 | if (!ShiftBV) |
32466 | return SDValue(); |
32467 | |
32468 | EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); |
32469 | auto *ShiftAmt = ShiftBV->getConstantSplatNode(); |
32470 | if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) |
32471 | return SDValue(); |
32472 | |
32473 | // Create a greater-than comparison against -1. We don't use the more obvious |
32474 | // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. |
32475 | return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); |
32476 | } |
32477 | |
32478 | /// Check if truncation with saturation from type \p SrcVT to \p DstVT
32479 | /// is valid for the given \p Subtarget. |
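| ///
| /// For example (illustrative): v8i64 -> v8i32 is valid with plain AVX512F,
| /// v32i16 -> v32i8 additionally requires BWI (source elements narrower than
| /// 32 bits), and sub-512-bit sources such as v8i32 -> v8i16 additionally
| /// require VLX.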
32480 | static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, |
32481 | const X86Subtarget &Subtarget) { |
32482 | if (!Subtarget.hasAVX512()) |
32483 | return false; |
32484 | |
32485 | // FIXME: A scalar type may be supported if we move it to a vector register.
32486 | if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) |
32487 | return false; |
32488 | |
32489 | EVT SrcElVT = SrcVT.getScalarType(); |
32490 | EVT DstElVT = DstVT.getScalarType(); |
32491 | if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) |
32492 | return false; |
32493 | if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) |
32494 | return false; |
32495 | if (SrcVT.is512BitVector() || Subtarget.hasVLX()) |
32496 | return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); |
32497 | return false; |
32498 | } |
32499 | |
32500 | /// Detect a pattern of truncation with saturation: |
32501 | /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). |
32502 | /// Return the source value to be truncated or SDValue() if the pattern was not |
32503 | /// matched. |
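| ///
| /// For example (illustrative): for (v8i16 (truncate (umin (v8i32 X,
| /// splat(65535))))) this returns X, since 65535 is the unsigned maximum of
| /// i16, i.e. a mask of the low 16 bits.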
32504 | static SDValue detectUSatPattern(SDValue In, EVT VT) { |
32505 | if (In.getOpcode() != ISD::UMIN) |
32506 | return SDValue(); |
32507 | |
32508 | // Saturation with truncation. We truncate from InVT to VT.
32509 | assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32510 | "Unexpected types for truncate operation");
32511 | |
32512 | APInt C; |
32513 | if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { |
32514 | // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
32515 | // the element size of the destination type.
32516 | return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : |
32517 | SDValue(); |
32518 | } |
32519 | return SDValue(); |
32520 | } |
32521 | |
32522 | /// Detect a pattern of truncation with saturation: |
32523 | /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). |
32524 | /// The types should allow using the VPMOVUS* instructions on AVX512.
32525 | /// Return the source value to be truncated or SDValue() if the pattern was not |
32526 | /// matched. |
32527 | static SDValue detectAVX512USatPattern(SDValue In, EVT VT, |
32528 | const X86Subtarget &Subtarget) { |
32529 | if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) |
32530 | return SDValue(); |
32531 | return detectUSatPattern(In, VT); |
32532 | } |
32533 | |
32534 | static SDValue |
32535 | combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, |
32536 | const X86Subtarget &Subtarget) { |
32537 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
32538 | if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT)) |
32539 | return SDValue(); |
32540 | if (auto USatVal = detectUSatPattern(In, VT)) |
32541 | if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) |
32542 | return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); |
32543 | return SDValue(); |
32544 | } |
32545 | |
32546 | /// This function detects the AVG pattern between vectors of unsigned i8/i16, |
32547 | /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32548 | /// X86ISD::AVG instruction. |
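| ///
| /// For example (illustrative, i8 elements): a = 6, b = 7 gives
| /// c = (6 + 7 + 1) / 2 = 7, matching the round-half-up behavior of PAVGB.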
32549 | static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, |
32550 | const X86Subtarget &Subtarget, |
32551 | const SDLoc &DL) { |
32552 | if (!VT.isVector() || !VT.isSimple()) |
32553 | return SDValue(); |
32554 | EVT InVT = In.getValueType(); |
32555 | unsigned NumElems = VT.getVectorNumElements(); |
32556 | |
32557 | EVT ScalarVT = VT.getVectorElementType(); |
32558 | if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && |
32559 | isPowerOf2_32(NumElems))) |
32560 | return SDValue(); |
32561 | |
32562 | // InScalarVT is the intermediate type in the AVG pattern and it should be
32563 | // wider than the destination element type (i8/i16).
32564 | EVT InScalarVT = InVT.getVectorElementType(); |
32565 | if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) |
32566 | return SDValue(); |
32567 | |
32568 | if (!Subtarget.hasSSE2()) |
32569 | return SDValue(); |
32570 | if (Subtarget.hasBWI()) { |
32571 | if (VT.getSizeInBits() > 512) |
32572 | return SDValue(); |
32573 | } else if (Subtarget.hasAVX2()) { |
32574 | if (VT.getSizeInBits() > 256) |
32575 | return SDValue(); |
32576 | } else { |
32577 | if (VT.getSizeInBits() > 128) |
32578 | return SDValue(); |
32579 | } |
32580 | |
32581 | // Detect the following pattern: |
32582 | // |
32583 | // %1 = zext <N x i8> %a to <N x i32> |
32584 | // %2 = zext <N x i8> %b to <N x i32> |
32585 | // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> |
32586 | // %4 = add nuw nsw <N x i32> %3, %2 |
32587 | // %5 = lshr <N x i32> %4, <i32 1 x N>
32588 | // %6 = trunc <N x i32> %5 to <N x i8> |
32589 | // |
32590 | // In AVX512, the last instruction can also be a trunc store. |
32591 | |
32592 | if (In.getOpcode() != ISD::SRL) |
32593 | return SDValue(); |
32594 | |
32595 | // A lambda checking whether the given SDValue is a constant vector and each
32596 | // element is in the range [Min, Max].
32597 | auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { |
32598 | BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); |
32599 | if (!BV || !BV->isConstant()) |
32600 | return false; |
32601 | for (SDValue Op : V->ops()) { |
32602 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); |
32603 | if (!C) |
32604 | return false; |
32605 | uint64_t Val = C->getZExtValue(); |
32606 | if (Val < Min || Val > Max) |
32607 | return false; |
32608 | } |
32609 | return true; |
32610 | }; |
32611 | |
32612 | // Check if each element of the vector is right-shifted by one.
32613 | auto LHS = In.getOperand(0); |
32614 | auto RHS = In.getOperand(1); |
32615 | if (!IsConstVectorInRange(RHS, 1, 1)) |
32616 | return SDValue(); |
32617 | if (LHS.getOpcode() != ISD::ADD) |
32618 | return SDValue(); |
32619 | |
32620 | // Detect a pattern of a + b + 1 where the order doesn't matter. |
32621 | SDValue Operands[3]; |
32622 | Operands[0] = LHS.getOperand(0); |
32623 | Operands[1] = LHS.getOperand(1); |
32624 | |
32625 | // Take care of the case when one of the operands is a constant vector whose
32626 | // element is in the range [1, 256] for i8, or [1, 65536] for i16.
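| // This works because a + C = a + (C - 1) + 1, so
| // (a + C) >> 1 == avg(a, C - 1), and the range check guarantees that C - 1
| // fits in the destination element type (e.g. C = 256 becomes avg(a, 255)).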
32627 | if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && |
32628 | Operands[0].getOpcode() == ISD::ZERO_EXTEND && |
32629 | Operands[0].getOperand(0).getValueType() == VT) { |
32630 | // The pattern is detected. Subtract one from the constant vector, then |
32631 | // demote it and emit X86ISD::AVG instruction. |
32632 | SDValue VecOnes = DAG.getConstant(1, DL, InVT); |
32633 | Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); |
32634 | Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); |
32635 | return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), |
32636 | Operands[1]); |
32637 | } |
32638 | |
32639 | if (Operands[0].getOpcode() == ISD::ADD) |
32640 | std::swap(Operands[0], Operands[1]); |
32641 | else if (Operands[1].getOpcode() != ISD::ADD) |
32642 | return SDValue(); |
32643 | Operands[2] = Operands[1].getOperand(0); |
32644 | Operands[1] = Operands[1].getOperand(1); |
32645 | |
32646 | // Now we have three operands of two additions. Check that one of them is a |
32647 | // constant vector with ones, and the other two are promoted from i8/i16. |
32648 | for (int i = 0; i < 3; ++i) { |
32649 | if (!IsConstVectorInRange(Operands[i], 1, 1)) |
32650 | continue; |
32651 | std::swap(Operands[i], Operands[2]); |
32652 | |
32653 | // Check if Operands[0] and Operands[1] are results of type promotion. |
32654 | for (int j = 0; j < 2; ++j) |
32655 | if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || |
32656 | Operands[j].getOperand(0).getValueType() != VT) |
32657 | return SDValue(); |
32658 | |
32659 | // The pattern is detected, emit X86ISD::AVG instruction. |
32660 | return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), |
32661 | Operands[1].getOperand(0)); |
32662 | } |
32663 | |
32664 | return SDValue(); |
32665 | } |
32666 | |
32667 | static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, |
32668 | TargetLowering::DAGCombinerInfo &DCI, |
32669 | const X86Subtarget &Subtarget) { |
32670 | LoadSDNode *Ld = cast<LoadSDNode>(N); |
32671 | EVT RegVT = Ld->getValueType(0); |
32672 | EVT MemVT = Ld->getMemoryVT(); |
32673 | SDLoc dl(Ld); |
32674 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
32675 | |
32676 | // For chips with slow 32-byte unaligned loads, break the 32-byte operation |
32677 | // into two 16-byte operations. Also split non-temporal aligned loads on |
32678 | // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. |
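| // For example (illustrative): on such a target a 32-byte v8f32 load becomes
| // two 16-byte v4f32 loads at offsets 0 and 16, recombined with
| // insert128BitVector, with a TokenFactor joining the two load chains.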
32679 | ISD::LoadExtType Ext = Ld->getExtensionType(); |
32680 | bool Fast; |
32681 | unsigned AddressSpace = Ld->getAddressSpace(); |
32682 | unsigned Alignment = Ld->getAlignment(); |
32683 | if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && |
32684 | Ext == ISD::NON_EXTLOAD && |
32685 | ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || |
32686 | (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, |
32687 | AddressSpace, Alignment, &Fast) && !Fast))) { |
32688 | unsigned NumElems = RegVT.getVectorNumElements(); |
32689 | if (NumElems < 2) |
32690 | return SDValue(); |
32691 | |
32692 | SDValue Ptr = Ld->getBasePtr(); |
32693 | |
32694 | EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), |
32695 | NumElems/2); |
32696 | SDValue Load1 = |
32697 | DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), |
32698 | Alignment, Ld->getMemOperand()->getFlags()); |
32699 | |
32700 | Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); |
32701 | SDValue Load2 = |
32702 | DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), |
32703 | std::min(16U, Alignment), Ld->getMemOperand()->getFlags()); |
32704 | SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, |
32705 | Load1.getValue(1), |
32706 | Load2.getValue(1)); |
32707 | |
32708 | SDValue NewVec = DAG.getUNDEF(RegVT); |
32709 | NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl); |
32710 | NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl); |
32711 | return DCI.CombineTo(N, NewVec, TF, true); |
32712 | } |
32713 | |
32714 | return SDValue(); |
32715 | } |
32716 | |
32717 | /// If V is a build vector of boolean constants and exactly one of those |
32718 | /// constants is true, return the operand index of that true element. |
32719 | /// Otherwise, return -1. |
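| ///
| /// For example (illustrative): the mask <i1 0, i1 -1, i1 0, i1 0> returns 1,
| /// while <i1 -1, i1 -1, i1 0, i1 0> returns -1 because two elements are true.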
32720 | static int getOneTrueElt(SDValue V) { |
32721 | // This needs to be a build vector of booleans. |
32722 | // TODO: Checking for the i1 type matches the IR definition for the mask, |
32723 | // but the mask check could be loosened to i8 or other types. That might |
32724 | // also require checking more than 'allOnesValue'; e.g., the x86 HW
32725 | // instructions only require that the MSB is set for each mask element. |
32726 | // The ISD::MSTORE comments/definition do not specify how the mask operand |
32727 | // is formatted. |
32728 | auto *BV = dyn_cast<BuildVectorSDNode>(V); |
32729 | if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) |
32730 | return -1; |
32731 | |
32732 | int TrueIndex = -1; |
32733 | unsigned NumElts = BV->getValueType(0).getVectorNumElements(); |
32734 | for (unsigned i = 0; i < NumElts; ++i) { |
32735 | const SDValue &Op = BV->getOperand(i); |
32736 | if (Op.isUndef()) |
32737 | continue; |
32738 | auto *ConstNode = dyn_cast<ConstantSDNode>(Op); |
32739 | if (!ConstNode) |
32740 | return -1; |
32741 | if (ConstNode->getAPIntValue().isAllOnesValue()) { |
32742 | // If we already found a one, this is too many. |
32743 | if (TrueIndex >= 0) |
32744 | return -1; |
32745 | TrueIndex = i; |
32746 | } |
32747 | } |
32748 | return TrueIndex; |
32749 | } |
32750 | |
32751 | /// Given a masked memory load/store operation, return true if it has one mask |
32752 | /// bit set. If it has one mask bit set, then also return the memory address of |
32753 | /// the scalar element to load/store, the vector index to insert/extract that |
32754 | /// scalar element, and the alignment for the scalar memory access. |
32755 | static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, |
32756 | SelectionDAG &DAG, SDValue &Addr, |
32757 | SDValue &Index, unsigned &Alignment) { |
32758 | int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); |
32759 | if (TrueMaskElt < 0) |
32760 | return false; |
32761 | |
32762 | // Get the address of the one scalar element that is specified by the mask |
32763 | // using the appropriate offset from the base pointer. |
32764 | EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); |
32765 | Addr = MaskedOp->getBasePtr(); |
32766 | if (TrueMaskElt != 0) { |
32767 | unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); |
32768 | Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); |
32769 | } |
32770 | |
32771 | Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); |
32772 | Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); |
32773 | return true; |
32774 | } |
32775 | |
32776 | /// If exactly one element of the mask is set for a non-extending masked load, |
32777 | /// it is a scalar load and vector insert. |
32778 | /// Note: It is expected that the degenerate cases of an all-zeros or all-ones |
32779 | /// mask have already been optimized in IR, so we don't bother with those here. |
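| ///
| /// For example (illustrative): a v4f32 masked load with mask
| /// <i1 0, i1 0, i1 -1, i1 0> becomes a scalar f32 load from base+8 plus an
| /// INSERT_VECTOR_ELT of the loaded value into the pass-through at index 2.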
32780 | static SDValue |
32781 | reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, |
32782 | TargetLowering::DAGCombinerInfo &DCI) { |
32783 | // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. |
32784 | // However, some target hooks may need to be added to know when the transform |
32785 | // is profitable. Endianness would also have to be considered. |
32786 | |
32787 | SDValue Addr, VecIndex; |
32788 | unsigned Alignment; |
32789 | if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) |
32790 | return SDValue(); |
32791 | |
32792 | // Load the one scalar element that is specified by the mask using the |
32793 | // appropriate offset from the base pointer. |
32794 | SDLoc DL(ML); |
32795 | EVT VT = ML->getValueType(0); |
32796 | EVT EltVT = VT.getVectorElementType(); |
32797 | SDValue Load = |
32798 | DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), |
32799 | Alignment, ML->getMemOperand()->getFlags()); |
32800 | |
32801 | // Insert the loaded element into the appropriate place in the vector. |
32802 | SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(), |
32803 | Load, VecIndex); |
32804 | return DCI.CombineTo(ML, Insert, Load.getValue(1), true); |
32805 | } |
32806 | |
32807 | static SDValue |
32808 | combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, |
32809 | TargetLowering::DAGCombinerInfo &DCI) { |
32810 | if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) |
32811 | return SDValue(); |
32812 | |
32813 | SDLoc DL(ML); |
32814 | EVT VT = ML->getValueType(0); |
32815 | |
32816 | // If we are loading the first and last elements of a vector, it is safe and |
32817 | // always faster to load the whole vector. Replace the masked load with a |
32818 | // vector load and select. |
32819 | unsigned NumElts = VT.getVectorNumElements(); |
32820 | BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask()); |
32821 | bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); |
32822 | bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); |
32823 | if (LoadFirstElt && LoadLastElt) { |
32824 | SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), |
32825 | ML->getMemOperand()); |
32826 | SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0()); |
32827 | return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); |
32828 | } |
32829 | |
32830 | // Convert a masked load with a constant mask into a masked load and a select. |
32831 | // This allows the select operation to use a faster kind of select instruction |
32832 | // (for example, vblendvps -> vblendps). |
32833 | |
32834 | // Don't try this if the pass-through operand is already undefined. That would |
32835 | // cause an infinite loop because that's what we're about to create. |
32836 | if (ML->getSrc0().isUndef()) |
32837 | return SDValue(); |
32838 | |
32839 | // The new masked load has an undef pass-through operand. The select uses the |
32840 | // original pass-through operand. |
32841 | SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), |
32842 | ML->getMask(), DAG.getUNDEF(VT), |
32843 | ML->getMemoryVT(), ML->getMemOperand(), |
32844 | ML->getExtensionType()); |
32845 | SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0()); |
32846 | |
32847 | return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); |
32848 | } |
32849 | |
32850 | static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, |
32851 | TargetLowering::DAGCombinerInfo &DCI, |
32852 | const X86Subtarget &Subtarget) { |
32853 | MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); |
32854 | |
32855 | // TODO: Expanding loads with a constant mask may be optimized as well.
32856 | if (Mld->isExpandingLoad()) |
32857 | return SDValue(); |
32858 | |
32859 | if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { |
32860 | if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) |
32861 | return ScalarLoad; |
32862 | // TODO: Do some AVX512 subsets benefit from this transform? |
32863 | if (!Subtarget.hasAVX512()) |
32864 | if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) |
32865 | return Blend; |
32866 | } |
32867 | |
32868 | if (Mld->getExtensionType() != ISD::SEXTLOAD) |
32869 | return SDValue(); |
32870 | |
32871 | // Resolve extending loads. |
32872 | EVT VT = Mld->getValueType(0); |
32873 | unsigned NumElems = VT.getVectorNumElements(); |
32874 | EVT LdVT = Mld->getMemoryVT(); |
32875 | SDLoc dl(Mld); |
32876 | |
32877 | assert(LdVT != VT && "Cannot extend to the same type");
32878 | unsigned ToSz = VT.getScalarSizeInBits(); |
32879 | unsigned FromSz = LdVT.getScalarSizeInBits(); |
32880 | // From/To sizes and ElemCount must be pow of two. |
32881 | assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
32882 | "Unexpected size for extending masked load");
32883 | |
32884 | unsigned SizeRatio = ToSz / FromSz; |
32885 | assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32886 | |
32887 | // Create a type on which we perform the shuffle. |
32888 | EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), |
32889 | LdVT.getScalarType(), NumElems*SizeRatio); |
32890 | assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32891 | |
32892 | // Convert Src0 value. |
32893 | SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); |
32894 | if (!Mld->getSrc0().isUndef()) { |
32895 | SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); |
32896 | for (unsigned i = 0; i != NumElems; ++i) |
32897 | ShuffleVec[i] = i * SizeRatio; |
32898 | |
32899 | // Can't shuffle using an illegal type. |
32900 | assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32901 | "WideVecVT should be legal");
32902 | WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, |
32903 | DAG.getUNDEF(WideVecVT), ShuffleVec); |
32904 | } |
32905 | // Prepare the new mask. |
32906 | SDValue NewMask; |
32907 | SDValue Mask = Mld->getMask(); |
32908 | if (Mask.getValueType() == VT) { |
32909 | // Mask and original value have the same type. |
32910 | NewMask = DAG.getBitcast(WideVecVT, Mask); |
32911 | SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); |
32912 | for (unsigned i = 0; i != NumElems; ++i) |
32913 | ShuffleVec[i] = i * SizeRatio; |
32914 | for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) |
32915 | ShuffleVec[i] = NumElems * SizeRatio; |
32916 | NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, |
32917 | DAG.getConstant(0, dl, WideVecVT), |
32918 | ShuffleVec); |
32919 | } else { |
32920 | assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32921 | unsigned WidenNumElts = NumElems*SizeRatio; |
32922 | unsigned MaskNumElts = VT.getVectorNumElements(); |
32923 | EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, |
32924 | WidenNumElts); |
32925 | |
32926 | unsigned NumConcat = WidenNumElts / MaskNumElts; |
32927 | SmallVector<SDValue, 16> Ops(NumConcat); |
32928 | SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); |
32929 | Ops[0] = Mask; |
32930 | for (unsigned i = 1; i != NumConcat; ++i) |
32931 | Ops[i] = ZeroVal; |
32932 | |
32933 | NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); |
32934 | } |
32935 | |
32936 | SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), |
32937 | Mld->getBasePtr(), NewMask, WideSrc0, |
32938 | Mld->getMemoryVT(), Mld->getMemOperand(), |
32939 | ISD::NON_EXTLOAD); |
32940 | SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); |
32941 | return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); |
32942 | } |
32943 | |
32944 | /// If exactly one element of the mask is set for a non-truncating masked store, |
32945 | /// it is a vector extract and scalar store. |
32946 | /// Note: It is expected that the degenerate cases of an all-zeros or all-ones |
32947 | /// mask have already been optimized in IR, so we don't bother with those here. |
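| ///
| /// For example (illustrative): a v4i32 masked store with mask
| /// <i1 0, i1 -1, i1 0, i1 0> becomes an EXTRACT_VECTOR_ELT of element 1
| /// followed by a plain i32 store to base+4.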
32948 | static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, |
32949 | SelectionDAG &DAG) { |
32950 | // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. |
32951 | // However, some target hooks may need to be added to know when the transform |
32952 | // is profitable. Endianness would also have to be considered. |
32953 | |
32954 | SDValue Addr, VecIndex; |
32955 | unsigned Alignment; |
32956 | if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) |
32957 | return SDValue(); |
32958 | |
32959 | // Extract the one scalar element that is actually being stored. |
32960 | SDLoc DL(MS); |
32961 | EVT VT = MS->getValue().getValueType(); |
32962 | EVT EltVT = VT.getVectorElementType(); |
32963 | SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, |
32964 | MS->getValue(), VecIndex); |
32965 | |
32966 | // Store that element at the appropriate offset from the base pointer. |
32967 | return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), |
32968 | Alignment, MS->getMemOperand()->getFlags()); |
32969 | } |
32970 | |
32971 | static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, |
32972 | const X86Subtarget &Subtarget) { |
32973 | MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); |
32974 | |
32975 | if (Mst->isCompressingStore()) |
32976 | return SDValue(); |
32977 | |
32978 | if (!Mst->isTruncatingStore()) |
32979 | return reduceMaskedStoreToScalarStore(Mst, DAG); |
32980 | |
32981 | // Resolve truncating stores. |
32982 | EVT VT = Mst->getValue().getValueType(); |
32983 | unsigned NumElems = VT.getVectorNumElements(); |
32984 | EVT StVT = Mst->getMemoryVT(); |
32985 | SDLoc dl(Mst); |
32986 | |
32987 | assert(StVT != VT && "Cannot truncate to the same type");
32988 | unsigned FromSz = VT.getScalarSizeInBits(); |
32989 | unsigned ToSz = StVT.getScalarSizeInBits(); |
32990 | |
32991 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
32992 | |
32993 | // The truncating store is legal in some cases. For example,
32994 | // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
32995 | // are dedicated truncating-store instructions.
32996 | // In this case we don't need any further transformations. |
32997 | if (TLI.isTruncStoreLegal(VT, StVT)) |
32998 | return SDValue(); |
32999 | |
33000 | // From/To sizes and ElemCount must be pow of two. |
33001 | assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
33002 | "Unexpected size for truncating masked store");
33003 | // We are going to use the original vector elt for storing. |
33004 | // Accumulated smaller vector elements must be a multiple of the store size. |
33005 | assert(((NumElems * FromSz) % ToSz) == 0 &&
33006 | "Unexpected ratio for truncating masked store");
33007 | |
33008 | unsigned SizeRatio = FromSz / ToSz; |
33009 | assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33010 | |
33011 | // Create a type on which we perform the shuffle. |
33012 | EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), |
33013 | StVT.getScalarType(), NumElems*SizeRatio); |
33014 | |
33015 | assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33016 | |
33017 | SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); |
33018 | SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); |
33019 | for (unsigned i = 0; i != NumElems; ++i) |
33020 | ShuffleVec[i] = i * SizeRatio; |
33021 | |
33022 | // Can't shuffle using an illegal type. |
33023 | assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33024 | "WideVecVT should be legal");
33025 | |
33026 | SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, |
33027 | DAG.getUNDEF(WideVecVT), |
33028 | ShuffleVec); |
33029 | |
33030 | SDValue NewMask; |
33031 | SDValue Mask = Mst->getMask(); |
33032 | if (Mask.getValueType() == VT) { |
33033 | // Mask and original value have the same type. |
33034 | NewMask = DAG.getBitcast(WideVecVT, Mask); |
33035 | for (unsigned i = 0; i != NumElems; ++i) |
33036 | ShuffleVec[i] = i * SizeRatio; |
33037 | for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) |
33038 | ShuffleVec[i] = NumElems*SizeRatio; |
33039 | NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, |
33040 | DAG.getConstant(0, dl, WideVecVT), |
33041 | ShuffleVec); |
33042 | } else { |
33043 | assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33044 | unsigned WidenNumElts = NumElems*SizeRatio; |
33045 | unsigned MaskNumElts = VT.getVectorNumElements(); |
33046 | EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, |
33047 | WidenNumElts); |
33048 | |
33049 | unsigned NumConcat = WidenNumElts / MaskNumElts; |
33050 | SmallVector<SDValue, 16> Ops(NumConcat); |
33051 | SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); |
33052 | Ops[0] = Mask; |
33053 | for (unsigned i = 1; i != NumConcat; ++i) |
33054 | Ops[i] = ZeroVal; |
33055 | |
33056 | NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); |
33057 | } |
33058 | |
33059 | return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, |
33060 | Mst->getBasePtr(), NewMask, StVT, |
33061 | Mst->getMemOperand(), false); |
33062 | } |
33063 | |
33064 | static SDValue combineStore(SDNode *N, SelectionDAG &DAG, |
33065 | const X86Subtarget &Subtarget) { |
33066 | StoreSDNode *St = cast<StoreSDNode>(N); |
33067 | EVT VT = St->getValue().getValueType(); |
33068 | EVT StVT = St->getMemoryVT(); |
33069 | SDLoc dl(St); |
33070 | SDValue StoredVal = St->getOperand(1); |
33071 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
33072 | |
33073 | // If we are saving a concatenation of two XMM registers and 32-byte stores |
33074 | // are slow, such as on Sandy Bridge, perform two 16-byte stores. |
33075 | bool Fast; |
33076 | unsigned AddressSpace = St->getAddressSpace(); |
33077 | unsigned Alignment = St->getAlignment(); |
33078 | if (VT.is256BitVector() && StVT == VT && |
33079 | TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, |
33080 | AddressSpace, Alignment, &Fast) && |
33081 | !Fast) { |
33082 | unsigned NumElems = VT.getVectorNumElements(); |
33083 | if (NumElems < 2) |
33084 | return SDValue(); |
33085 | |
33086 | SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); |
33087 | SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); |
33088 | |
33089 | SDValue Ptr0 = St->getBasePtr(); |
33090 | SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); |
33091 | |
33092 | SDValue Ch0 = |
33093 | DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), |
33094 | Alignment, St->getMemOperand()->getFlags()); |
33095 | SDValue Ch1 = |
33096 | DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), |
33097 | std::min(16U, Alignment), St->getMemOperand()->getFlags()); |
33098 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); |
33099 | } |
33100 | |
33101 | // Optimize trunc store (of multiple scalars) to shuffle and store. |
33102 | // First, pack all of the elements in one place. Next, store to memory |
33103 | // in fewer chunks. |
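| // For example (illustrative): a v8i32 -> v8i16 truncating store can be
| // rewritten as a v16i16 shuffle that packs the eight live i16 values into
| // the low half of the register, followed by two i64 (or f64) stores.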
33104 | if (St->isTruncatingStore() && VT.isVector()) { |
33105 | // Check if we can detect an AVG pattern from the truncation. If yes, |
33106 | // replace the trunc store by a normal store with the result of X86ISD::AVG |
33107 | // instruction. |
33108 | if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, |
33109 | Subtarget, dl)) |
33110 | return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), |
33111 | St->getPointerInfo(), St->getAlignment(), |
33112 | St->getMemOperand()->getFlags()); |
33113 | |
33114 | if (SDValue Val = |
33115 | detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) |
33116 | return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), |
33117 | dl, Val, St->getBasePtr(), |
33118 | St->getMemoryVT(), St->getMemOperand(), DAG); |
33119 | |
33120 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
33121 | unsigned NumElems = VT.getVectorNumElements(); |
33122 | assert(StVT != VT && "Cannot truncate to the same type");
33123 | unsigned FromSz = VT.getScalarSizeInBits(); |
33124 | unsigned ToSz = StVT.getScalarSizeInBits(); |
33125 | |
33126 | // The truncating store is legal in some cases. For example,
33127 | // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33128 | // are dedicated truncating-store instructions.
33129 | // In this case we don't need any further transformations. |
33130 | if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) |
33131 | return SDValue(); |
33132 | |
33133 | // From, To sizes and ElemCount must be pow of two |
33134 | if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); |
33135 | // We are going to use the original vector elt for storing. |
33136 | // Accumulated smaller vector elements must be a multiple of the store size. |
33137 | if (0 != (NumElems * FromSz) % ToSz) return SDValue(); |
33138 | |
33139 | unsigned SizeRatio = FromSz / ToSz; |
33140 | |
33141 | assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33142 | |
33143 | // Create a type on which we perform the shuffle |
33144 | EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), |
33145 | StVT.getScalarType(), NumElems*SizeRatio); |
33146 | |
33147 | assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33148 | |
33149 | SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); |
33150 | SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); |
33151 | for (unsigned i = 0; i != NumElems; ++i) |
33152 | ShuffleVec[i] = i * SizeRatio; |
33153 | |
33154 | // Can't shuffle using an illegal type. |
33155 | if (!TLI.isTypeLegal(WideVecVT)) |
33156 | return SDValue(); |
33157 | |
33158 | SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, |
33159 | DAG.getUNDEF(WideVecVT), |
33160 | ShuffleVec); |
33161 | // At this point all of the data is stored at the bottom of the |
33162 | // register. We now need to save it to memory.
33163 | |
33164 | // Find the largest store unit |
33165 | MVT StoreType = MVT::i8; |
33166 | for (MVT Tp : MVT::integer_valuetypes()) { |
33167 | if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) |
33168 | StoreType = Tp; |
33169 | } |
33170 | |
33171 | // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
33172 | if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && |
33173 | (64 <= NumElems * ToSz)) |
33174 | StoreType = MVT::f64; |
33175 | |
33176 | // Bitcast the original vector into a vector of store-size units |
33177 | EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), |
33178 | StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); |
33179 | assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33180 | SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); |
33181 | SmallVector<SDValue, 8> Chains; |
33182 | SDValue Ptr = St->getBasePtr(); |
33183 | |
33184 | // Perform one or more big stores into memory. |
33185 | for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { |
33186 | SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, |
33187 | StoreType, ShuffWide, |
33188 | DAG.getIntPtrConstant(i, dl)); |
33189 | SDValue Ch = |
33190 | DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), |
33191 | St->getAlignment(), St->getMemOperand()->getFlags()); |
33192 | Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); |
33193 | Chains.push_back(Ch); |
33194 | } |
33195 | |
33196 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); |
33197 | } |
33198 | |
33199 | // Turn load->store of MMX types into GPR load/stores. This avoids clobbering |
33200 | // the FP state in cases where an emms may be missing. |
33201 | // A preferable solution to the general problem is to figure out the right |
33202 | // places to insert EMMS. This qualifies as a quick hack. |
33203 | |
33204 | // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. |
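| // For example (illustrative, 32-bit target with SSE2): an i64 load feeding
| // an i64 store becomes an f64 load/store pair; without SSE2 it is split
| // below into two i32 load/store pairs at offsets 0 and 4.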
33205 | if (VT.getSizeInBits() != 64) |
33206 | return SDValue(); |
33207 | |
33208 | const Function *F = DAG.getMachineFunction().getFunction(); |
33209 | bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); |
33210 | bool F64IsLegal = |
33211 | !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); |
33212 | if ((VT.isVector() || |
33213 | (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && |
33214 | isa<LoadSDNode>(St->getValue()) && |
33215 | !cast<LoadSDNode>(St->getValue())->isVolatile() && |
33216 | St->getChain().hasOneUse() && !St->isVolatile()) { |
33217 | SDNode* LdVal = St->getValue().getNode(); |
33218 | LoadSDNode *Ld = nullptr; |
33219 | int TokenFactorIndex = -1; |
33220 | SmallVector<SDValue, 8> Ops; |
33221 | SDNode* ChainVal = St->getChain().getNode(); |
33222 | // Must be a store of a load. We currently handle two cases: the load |
33223 | // is a direct child, and it's under an intervening TokenFactor. It is |
33224 | // possible to dig deeper under nested TokenFactors. |
33225 | if (ChainVal == LdVal) |
33226 | Ld = cast<LoadSDNode>(St->getChain()); |
33227 | else if (St->getValue().hasOneUse() && |
33228 | ChainVal->getOpcode() == ISD::TokenFactor) { |
33229 | for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { |
33230 | if (ChainVal->getOperand(i).getNode() == LdVal) { |
33231 | TokenFactorIndex = i; |
33232 | Ld = cast<LoadSDNode>(St->getValue()); |
33233 | } else |
33234 | Ops.push_back(ChainVal->getOperand(i)); |
33235 | } |
33236 | } |
33237 | |
33238 | if (!Ld || !ISD::isNormalLoad(Ld)) |
33239 | return SDValue(); |
33240 | |
33241 | // If this is not the MMX case, i.e. we are just turning i64 load/store |
33242 | // into f64 load/store, avoid the transformation if there are multiple |
33243 | // uses of the loaded value. |
33244 | if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) |
33245 | return SDValue(); |
33246 | |
33247 | SDLoc LdDL(Ld); |
33248 | SDLoc StDL(N); |
33249 | // If we are a 64-bit capable x86, lower to a single movq load/store pair. |
33250 | // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store |
33251 | // pair instead. |
33252 | if (Subtarget.is64Bit() || F64IsLegal) { |
33253 | MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; |
33254 | SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), |
33255 | Ld->getPointerInfo(), Ld->getAlignment(), |
33256 | Ld->getMemOperand()->getFlags()); |
33257 | SDValue NewChain = NewLd.getValue(1); |
33258 | if (TokenFactorIndex >= 0) { |
33259 | Ops.push_back(NewChain); |
33260 | NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); |
33261 | } |
33262 | return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), |
33263 | St->getPointerInfo(), St->getAlignment(), |
33264 | St->getMemOperand()->getFlags()); |
33265 | } |
33266 | |
33267 | // Otherwise, lower to two pairs of 32-bit loads / stores. |
33268 | SDValue LoAddr = Ld->getBasePtr(); |
33269 | SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); |
33270 | |
33271 | SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, |
33272 | Ld->getPointerInfo(), Ld->getAlignment(), |
33273 | Ld->getMemOperand()->getFlags()); |
33274 | SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, |
33275 | Ld->getPointerInfo().getWithOffset(4), |
33276 | MinAlign(Ld->getAlignment(), 4), |
33277 | Ld->getMemOperand()->getFlags()); |
33278 | |
33279 | SDValue NewChain = LoLd.getValue(1); |
33280 | if (TokenFactorIndex >= 0) { |
33281 | Ops.push_back(LoLd); |
33282 | Ops.push_back(HiLd); |
33283 | NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); |
33284 | } |
33285 | |
33286 | LoAddr = St->getBasePtr(); |
33287 | HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); |
33288 | |
33289 | SDValue LoSt = |
33290 | DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), |
33291 | St->getAlignment(), St->getMemOperand()->getFlags()); |
33292 | SDValue HiSt = DAG.getStore( |
33293 | NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), |
33294 | MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); |
33295 | return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); |
33296 | } |
33297 | |
33298 | // This is similar to the above case, but here we handle a scalar 64-bit |
33299 | // integer store that is extracted from a vector on a 32-bit target. |
33300 | // If we have SSE2, then we can treat it like a floating-point double |
33301 | // to get past legalization. The execution dependencies fixup pass will |
33302 | // choose the optimal machine instruction for the store if this really is |
33303 | // an integer or v2f32 rather than an f64. |
33304 | if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && |
33305 | St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { |
33306 | SDValue OldExtract = St->getOperand(1); |
33307 | SDValue ExtOp0 = OldExtract.getOperand(0); |
33308 | unsigned VecSize = ExtOp0.getValueSizeInBits(); |
33309 | EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); |
33310 | SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); |
33311 | SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, |
33312 | BitCast, OldExtract.getOperand(1)); |
33313 | return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), |
33314 | St->getPointerInfo(), St->getAlignment(), |
33315 | St->getMemOperand()->getFlags()); |
33316 | } |
33317 | |
33318 | return SDValue(); |
33319 | } |
33320 | |
33321 | /// Return 'true' if this vector operation is "horizontal" |
33322 | /// and return the operands for the horizontal operation in LHS and RHS. A |
33323 | /// horizontal operation performs the binary operation on successive elements |
33324 | /// of its first operand, then on successive elements of its second operand, |
33325 | /// returning the resulting values in a vector. For example, if |
33326 | /// A = < float a0, float a1, float a2, float a3 > |
33327 | /// and |
33328 | /// B = < float b0, float b1, float b2, float b3 > |
33329 | /// then the result of doing a horizontal operation on A and B is |
33330 | /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. |
33331 | /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form |
33332 | /// A horizontal-op B, for some already available A and B, and if so then LHS is |
33333 | /// set to A, RHS to B, and the routine returns 'true'. |
33334 | /// Note that the binary operation should have the property that if one of the |
33335 | /// operands is UNDEF then the result is UNDEF. |
33336 | static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { |
33337 | // Look for the following pattern: if |
33338 | // A = < float a0, float a1, float a2, float a3 > |
33339 | // B = < float b0, float b1, float b2, float b3 > |
33340 | // and |
33341 | // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> |
33342 | // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> |
33343 | // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > |
33344 | // which is A horizontal-op B. |
33345 | |
33346 | // At least one of the operands should be a vector shuffle. |
33347 | if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && |
33348 | RHS.getOpcode() != ISD::VECTOR_SHUFFLE) |
33349 | return false; |
33350 | |
33351 | MVT VT = LHS.getSimpleValueType(); |
33352 | |
33353 | assert((VT.is128BitVector() || VT.is256BitVector()) &&
33354 | "Unsupported vector type for horizontal add/sub");
33355 | |
33356 | // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to |
33357 | // operate independently on 128-bit lanes. |
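| // For example (illustrative, v8f32): HADD(A, B) yields
| // < a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 >,
| // i.e. each 128-bit lane combines the corresponding lanes of A and B.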
33358 | unsigned NumElts = VT.getVectorNumElements(); |
33359 | unsigned NumLanes = VT.getSizeInBits()/128; |
33360 | unsigned NumLaneElts = NumElts / NumLanes; |
33361 | assert((NumLaneElts % 2 == 0) &&
33362 | "Vector type should have an even number of elements in each lane");
33363 | unsigned HalfLaneElts = NumLaneElts/2; |
33364 | |
33365 | // View LHS in the form |
33366 | // LHS = VECTOR_SHUFFLE A, B, LMask |
33367 | // If LHS is not a shuffle then pretend it is the shuffle |
33368 | // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> |
33369 | // NOTE: in what follows a default initialized SDValue represents an UNDEF of |
33370 | // type VT. |
33371 | SDValue A, B; |
33372 | SmallVector<int, 16> LMask(NumElts); |
33373 | if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { |
33374 | if (!LHS.getOperand(0).isUndef()) |
33375 | A = LHS.getOperand(0); |
33376 | if (!LHS.getOperand(1).isUndef()) |
33377 | B = LHS.getOperand(1); |
33378 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); |
33379 | std::copy(Mask.begin(), Mask.end(), LMask.begin()); |
33380 | } else { |
33381 | if (!LHS.isUndef()) |
33382 | A = LHS; |
33383 | for (unsigned i = 0; i != NumElts; ++i) |
33384 | LMask[i] = i; |
33385 | } |
33386 | |
33387 | // Likewise, view RHS in the form |
33388 | // RHS = VECTOR_SHUFFLE C, D, RMask |
33389 | SDValue C, D; |
33390 | SmallVector<int, 16> RMask(NumElts); |
33391 | if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { |
33392 | if (!RHS.getOperand(0).isUndef()) |
33393 | C = RHS.getOperand(0); |
33394 | if (!RHS.getOperand(1).isUndef()) |
33395 | D = RHS.getOperand(1); |
33396 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); |
33397 | std::copy(Mask.begin(), Mask.end(), RMask.begin()); |
33398 | } else { |
33399 | if (!RHS.isUndef()) |
33400 | C = RHS; |
33401 | for (unsigned i = 0; i != NumElts; ++i) |
33402 | RMask[i] = i; |
33403 | } |
33404 | |
33405 | // Check that the shuffles are both shuffling the same vectors. |
33406 | if (!(A == C && B == D) && !(A == D && B == C)) |
33407 | return false; |
33408 | |
33409 | // If everything is UNDEF then bail out: it would be better to fold to UNDEF. |
33410 | if (!A.getNode() && !B.getNode()) |
33411 | return false; |
33412 | |
33413 | // If A and B occur in reverse order in RHS, then "swap" them (which means |
33414 | // rewriting the mask). |
33415 | if (A != C) |
33416 | ShuffleVectorSDNode::commuteMask(RMask); |
33417 | |
33418 | // At this point LHS and RHS are equivalent to |
33419 | // LHS = VECTOR_SHUFFLE A, B, LMask |
33420 | // RHS = VECTOR_SHUFFLE A, B, RMask |
33421 | // Check that the masks correspond to performing a horizontal operation. |
33422 | for (unsigned l = 0; l != NumElts; l += NumLaneElts) { |
33423 | for (unsigned i = 0; i != NumLaneElts; ++i) { |
33424 | int LIdx = LMask[i+l], RIdx = RMask[i+l]; |
33425 | |
33426 | // Ignore any UNDEF components. |
33427 | if (LIdx < 0 || RIdx < 0 || |
33428 | (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || |
33429 | (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) |
33430 | continue; |
33431 | |
33432 | // Check that successive elements are being operated on. If not, this is |
33433 | // not a horizontal operation. |
33434 | unsigned Src = (i/HalfLaneElts); // each lane is split between srcs |
33435 | int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; |
33436 | if (!(LIdx == Index && RIdx == Index + 1) && |
33437 | !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) |
33438 | return false; |
33439 | } |
33440 | } |
33441 | |
33442 | LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. |
33443 | RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. |
33444 | return true; |
33445 | } |
33446 | |
33447 | /// Do target-specific dag combines on floating-point adds/subs. |
33448 | static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, |
33449 | const X86Subtarget &Subtarget) { |
33450 | EVT VT = N->getValueType(0); |
33451 | SDValue LHS = N->getOperand(0); |
33452 | SDValue RHS = N->getOperand(1); |
33453 | bool IsFadd = N->getOpcode() == ISD::FADD; |
33454 | assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33455 | |
33456 | // Try to synthesize horizontal add/sub from adds/subs of shuffles. |
33457 | if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || |
33458 | (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && |
33459 | isHorizontalBinOp(LHS, RHS, IsFadd)) { |
33460 | auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; |
33461 | return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); |
33462 | } |
33463 | return SDValue(); |
33464 | } |
33465 | |
33466 | /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify |
33467 | /// the codegen. |
33468 | /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) |
33469 | static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, |
33470 | const X86Subtarget &Subtarget, |
33471 | SDLoc &DL) { |
33472 | assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33473 | SDValue Src = N->getOperand(0); |
33474 | unsigned Opcode = Src.getOpcode(); |
33475 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
33476 | |
33477 | EVT VT = N->getValueType(0); |
33478 | EVT SrcVT = Src.getValueType(); |
33479 | |
33480 | auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) { |
33481 | unsigned TruncSizeInBits = VT.getScalarSizeInBits(); |
33482 | |
33483 | // Repeated operand, so we are only trading one output truncation for |
33484 | // one input truncation. |
33485 | if (Op0 == Op1) |
33486 | return true; |
33487 | |
33488 | // See if either operand has been extended from a smaller/equal size to |
33489 | // the truncation size, allowing a truncation to combine with the extend. |
33490 | unsigned Opcode0 = Op0.getOpcode(); |
33491 | if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND || |
33492 | Opcode0 == ISD::ZERO_EXTEND) && |
33493 | Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) |
33494 | return true; |
33495 | |
33496 | unsigned Opcode1 = Op1.getOpcode(); |
33497 | if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND || |
33498 | Opcode1 == ISD::ZERO_EXTEND) && |
33499 | Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) |
33500 | return true; |
33501 | |
33502 | // See if either operand is a single-use constant which can be constant
33503 | // folded.
33504 | SDValue BC0 = peekThroughOneUseBitcasts(Op0); |
33505 | SDValue BC1 = peekThroughOneUseBitcasts(Op1); |
33506 | return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || |
33507 | ISD::isBuildVectorOfConstantSDNodes(BC1.getNode()); |
33508 | }; |
33509 | |
33510 | auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { |
33511 | SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); |
33512 | SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); |
33513 | return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); |
33514 | }; |
33515 | |
33516 | // Don't combine if the operation has other uses. |
33517 | if (!N->isOnlyUserOf(Src.getNode())) |
33518 | return SDValue(); |
33519 | |
33520 | // Only support vector truncation for now. |
33521 | // TODO: i64 scalar math would benefit as well. |
33522 | if (!VT.isVector()) |
33523 | return SDValue(); |
33524 | |
33525 | // In most cases it's only worth pre-truncating if we're only facing the cost
33526 | // of one truncation, i.e. if one of the inputs will constant fold or the
33527 | // input is repeated.
33528 | switch (Opcode) { |
33529 | case ISD::AND: |
33530 | case ISD::XOR: |
33531 | case ISD::OR: { |
33532 | SDValue Op0 = Src.getOperand(0); |
33533 | SDValue Op1 = Src.getOperand(1); |
33534 | if (TLI.isOperationLegalOrPromote(Opcode, VT) && |
33535 | IsRepeatedOpOrFreeTruncation(Op0, Op1)) |
33536 | return TruncateArithmetic(Op0, Op1); |
33537 | break; |
33538 | } |
33539 | |
33540 | case ISD::MUL: |
33541 | // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33542 | // better to truncate if we have the chance.
33543 | if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && |
33544 | !TLI.isOperationLegal(Opcode, SrcVT)) |
33545 | return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); |
33546 | LLVM_FALLTHROUGH;
33547 | case ISD::ADD: { |
33548 | SDValue Op0 = Src.getOperand(0); |
33549 | SDValue Op1 = Src.getOperand(1); |
33550 | if (TLI.isOperationLegal(Opcode, VT) && |
33551 | IsRepeatedOpOrFreeTruncation(Op0, Op1)) |
33552 | return TruncateArithmetic(Op0, Op1); |
33553 | break; |
33554 | } |
33555 | } |
33556 | |
33557 | return SDValue(); |
33558 | } |
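      | // Illustrative sketch (hypothetical types): on a target where the v4i32
      | // multiply is legal but the v4i64 one is not (pre-AVX512DQ),
      | //   trunc (mul v4i64 X, Y) to v4i32
      | //     --> mul v4i32 (trunc X), (trunc Y)
      | // trades the expensive wide multiply for a cheap narrow one plus the
      | // input truncations discussed above.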
33559 | |
33560 | /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. |
33561 | static SDValue |
33562 | combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, |
33563 | SmallVector<SDValue, 8> &Regs) { |
33564 | assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33565 | Regs[0].getValueType() == MVT::v2i64));
33566 | EVT OutVT = N->getValueType(0); |
33567 | EVT OutSVT = OutVT.getVectorElementType(); |
33568 | EVT InVT = Regs[0].getValueType(); |
33569 | EVT InSVT = InVT.getVectorElementType(); |
33570 | SDLoc DL(N); |
33571 | |
33572 | // First, use mask to unset all bits that won't appear in the result. |
33573 | assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33574 | "OutSVT can only be either i8 or i16.");
33575 | APInt Mask = |
33576 | APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); |
33577 | SDValue MaskVal = DAG.getConstant(Mask, DL, InVT); |
33578 | for (auto &Reg : Regs) |
33579 | Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg); |
33580 | |
33581 | MVT UnpackedVT, PackedVT; |
33582 | if (OutSVT == MVT::i8) { |
33583 | UnpackedVT = MVT::v8i16; |
33584 | PackedVT = MVT::v16i8; |
33585 | } else { |
33586 | UnpackedVT = MVT::v4i32; |
33587 | PackedVT = MVT::v8i16; |
33588 | } |
33589 | |
33590 | // In each iteration, halve the element size.
33591 | auto RegNum = Regs.size(); |
33592 | for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); |
33593 | j < e; j *= 2, RegNum /= 2) { |
33594 | for (unsigned i = 0; i < RegNum; i++) |
33595 | Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]); |
33596 | for (unsigned i = 0; i < RegNum / 2; i++) |
33597 | Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], |
33598 | Regs[i * 2 + 1]); |
33599 | } |
33600 | |
33601 | // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
33602 | // and then extract a subvector as the result since v8i8 is not a legal type.
33603 | if (OutVT == MVT::v8i8) { |
33604 | Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); |
33605 | Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], |
33606 | DAG.getIntPtrConstant(0, DL)); |
33607 | return Regs[0]; |
33608 | } else if (RegNum > 1) { |
33609 | Regs.resize(RegNum); |
33610 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); |
33611 | } else |
33612 | return Regs[0]; |
33613 | } |
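      | // Worked sketch (assuming SSE4.1, where X86ISD::PACKUS on v4i32 inputs is
      | // packusdw) for 2 x v4i32 -> v8i16: after the AND with 0xFFFF, every i32
      | // element already fits in 16 bits, so the unsigned-saturating pack
      | //   Regs[0] = X86ISD::PACKUS v8i16, Regs[0], Regs[1]
      | // is lossless. Each round halves the element size, so i32 -> i16 needs
      | // one round and i32 -> i8 needs two.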
33614 | |
33615 | /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. |
33616 | static SDValue |
33617 | combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget, |
33618 | SelectionDAG &DAG, |
33619 | SmallVector<SDValue, 8> &Regs) { |
33620 | assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33621 | EVT OutVT = N->getValueType(0); |
33622 | SDLoc DL(N); |
33623 | |
33624 | // Shift left by 16 bits, then arithmetic-shift right by 16 bits. |
33625 | SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); |
33626 | for (auto &Reg : Regs) { |
33627 | Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, |
33628 | Subtarget, DAG); |
33629 | Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, |
33630 | Subtarget, DAG); |
33631 | } |
33632 | |
33633 | for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) |
33634 | Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], |
33635 | Regs[i * 2 + 1]); |
33636 | |
33637 | if (Regs.size() > 2) { |
33638 | Regs.resize(Regs.size() / 2); |
33639 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); |
33640 | } else |
33641 | return Regs[0]; |
33642 | } |
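      | // Illustrative sketch: for each v4i32 register the shl/sra pair leaves
      | //   T = sra (shl Reg, 16), 16
      | // with every element equal to sext(lo16), so packssdw's signed
      | // saturation is the identity and
      | //   X86ISD::PACKSS v8i16, T0, T1
      | // reproduces the low 16 bits of each element exactly.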
33643 | |
33644 | /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33645 | /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33646 | /// legalization the truncation will be translated into a BUILD_VECTOR whose
33647 | /// elements are each extracted from a vector and then truncated, and it is
33648 | /// difficult to perform this optimization on that form.
33649 | static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, |
33650 | const X86Subtarget &Subtarget) { |
33651 | EVT OutVT = N->getValueType(0); |
33652 | if (!OutVT.isVector()) |
33653 | return SDValue(); |
33654 | |
33655 | SDValue In = N->getOperand(0); |
33656 | if (!In.getValueType().isSimple()) |
33657 | return SDValue(); |
33658 | |
33659 | EVT InVT = In.getValueType(); |
33660 | unsigned NumElems = OutVT.getVectorNumElements(); |
33661 | |
33662 | // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on |
33663 | // SSE2, and we need to take care of it specially. |
33664 | // AVX512 provides vpmovdb. |
33665 | if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) |
33666 | return SDValue(); |
33667 | |
33668 | EVT OutSVT = OutVT.getVectorElementType(); |
33669 | EVT InSVT = InVT.getVectorElementType(); |
33670 | if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && |
33671 | (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && |
33672 | NumElems >= 8)) |
33673 | return SDValue(); |
33674 | |
33675 | // SSSE3's pshufb results in fewer instructions in the cases below.
33676 | if (Subtarget.hasSSSE3() && NumElems == 8 && |
33677 | ((OutSVT == MVT::i8 && InSVT != MVT::i64) || |
33678 | (InSVT == MVT::i32 && OutSVT == MVT::i16))) |
33679 | return SDValue(); |
33680 | |
33681 | SDLoc DL(N); |
33682 | |
33683 | // Split a long vector into vectors of legal type. |
33684 | unsigned RegNum = InVT.getSizeInBits() / 128; |
33685 | SmallVector<SDValue, 8> SubVec(RegNum); |
33686 | unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); |
33687 | EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); |
33688 | |
33689 | for (unsigned i = 0; i < RegNum; i++) |
33690 | SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, |
33691 | DAG.getIntPtrConstant(i * NumSubRegElts, DL)); |
33692 | |
33693 | // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS |
33694 | // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to |
33695 | // truncate 2 x v4i32 to v8i16. |
33696 | if (Subtarget.hasSSE41() || OutSVT == MVT::i8) |
33697 | return combineVectorTruncationWithPACKUS(N, DAG, SubVec); |
33698 | else if (InSVT == MVT::i32) |
33699 | return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); |
33700 | else |
33701 | return SDValue(); |
33702 | } |
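      | // Illustrative sketch (hypothetical types): a v8i32 -> v8i16 truncation
      | // is split into two v4i32 subvectors (extracted at indices 0 and 4) and
      | // handed to the PACKUS helper on SSE4.1, or to the PACKSS helper on
      | // SSSE3 and below, to be packed back into a single v8i16.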
33703 | |
33704 | /// This function transforms vector truncation of 'all-or-none' bit values
33705 | /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
33706 | static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, |
33707 | SelectionDAG &DAG, |
33708 | const X86Subtarget &Subtarget) { |
33709 | // Requires SSE2; skip on AVX512, which has fast truncate instructions.
33710 | if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) |
33711 | return SDValue(); |
33712 | |
33713 | if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) |
33714 | return SDValue(); |
33715 | |
33716 | SDValue In = N->getOperand(0); |
33717 | if (!In.getValueType().isSimple()) |
33718 | return SDValue(); |
33719 | |
33720 | MVT VT = N->getValueType(0).getSimpleVT(); |
33721 | MVT SVT = VT.getScalarType(); |
33722 | |
33723 | MVT InVT = In.getValueType().getSimpleVT(); |
33724 | MVT InSVT = InVT.getScalarType(); |
33725 | |
33726 | // Use PACKSS if the input is a splatted sign bit. |
33727 | // e.g. Comparison result, sext_in_reg, etc. |
33728 | unsigned NumSignBits = DAG.ComputeNumSignBits(In); |
33729 | if (NumSignBits != InSVT.getSizeInBits()) |
33730 | return SDValue(); |
33731 | |
33732 | // Check we have a truncation suited for PACKSS. |
33733 | if (!VT.is128BitVector() && !VT.is256BitVector()) |
33734 | return SDValue(); |
33735 | if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) |
33736 | return SDValue(); |
33737 | if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) |
33738 | return SDValue(); |
33739 | |
33740 | return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget); |
33741 | } |
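      | // Illustrative sketch: a typical source is a comparison, where each
      | // element is already all-zeros or all-ones:
      | //   trunc (setcc v8i32 A, B, cc) to v8i16
      | // ComputeNumSignBits() == 32 proves every element is a splatted sign
      | // bit, so the signed-saturating PACKSS lowering loses nothing.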
33742 | |
33743 | static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, |
33744 | const X86Subtarget &Subtarget) { |
33745 | EVT VT = N->getValueType(0); |
33746 | SDValue Src = N->getOperand(0); |
33747 | SDLoc DL(N); |
33748 | |
33749 | // Attempt to pre-truncate inputs to arithmetic ops instead. |
33750 | if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) |
33751 | return V; |
33752 | |
33753 | // Try to detect AVG pattern first. |
33754 | if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) |
33755 | return Avg; |
33756 | |
33757 | // Try to combine truncation with unsigned saturation. |
33758 | if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) |
33759 | return Val; |
33760 | |
33761 | // Detect a truncation to i32 whose source is a bitcast from x86mmx;
33762 | // the bitcast source is a direct MMX result.
33763 | if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { |
33764 | SDValue BCSrc = Src.getOperand(0); |
33765 | if (BCSrc.getValueType() == MVT::x86mmx) |
33766 | return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); |
33767 | } |
33768 | |
33769 | // Try to truncate extended sign bits with PACKSS. |
33770 | if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) |
33771 | return V; |
33772 | |
33773 | return combineVectorTruncation(N, DAG, Subtarget); |
33774 | } |
33775 | |
33776 | /// Returns the negated value if the node \p N flips sign of FP value. |
33777 | /// |
33778 | /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000). |
33779 | /// AVX512F does not have FXOR, so FNEG is lowered as |
33780 | /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). |
33781 | /// In this case we go through all bitcasts.
33782 | static SDValue isFNEG(SDNode *N) { |
33783 | if (N->getOpcode() == ISD::FNEG) |
33784 | return N->getOperand(0); |
33785 | |
33786 | SDValue Op = peekThroughBitcasts(SDValue(N, 0)); |
33787 | if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR) |
33788 | return SDValue(); |
33789 | |
33790 | SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); |
33791 | if (!Op1.getValueType().isFloatingPoint()) |
33792 | return SDValue(); |
33793 | |
33794 | SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); |
33795 | |
33796 | unsigned EltBits = Op1.getScalarValueSizeInBits(); |
33797 | auto isSignMask = [&](const ConstantFP *C) { |
33798 | return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); |
33799 | }; |
33800 | |
33801 | // There is more than one way to represent the same constant on
33802 | // different X86 targets. The type of the node may also depend on its size.
33803 | // - load scalar value and broadcast |
33804 | // - BUILD_VECTOR node |
33805 | // - load from a constant pool. |
33806 | // We check all variants here. |
33807 | if (Op1.getOpcode() == X86ISD::VBROADCAST) { |
33808 | if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) |
33809 | if (isSignMask(cast<ConstantFP>(C))) |
33810 | return Op0; |
33811 | |
33812 | } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { |
33813 | if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) |
33814 | if (isSignMask(CN->getConstantFPValue())) |
33815 | return Op0; |
33816 | |
33817 | } else if (auto *C = getTargetConstantFromNode(Op1)) { |
33818 | if (C->getType()->isVectorTy()) { |
33819 | if (auto *SplatV = C->getSplatValue()) |
33820 | if (isSignMask(cast<ConstantFP>(SplatV))) |
33821 | return Op0; |
33822 | } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) |
33823 | if (isSignMask(FPConst)) |
33824 | return Op0; |
33825 | } |
33826 | return SDValue(); |
33827 | } |
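      | // Illustrative sketch (hypothetical form): with AVX512F an fneg of X may
      | // arrive here as
      | //   bitcast (xor (bitcast X), (bitcast (splat ConstantFP(-0.0))))
      | // and since the bit pattern of -0.0 is exactly APInt::getSignMask(EltBits),
      | // the checks above recognize the splat and return X.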
33828 | |
33829 | /// Do target-specific dag combines on floating point negations. |
33830 | static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, |
33831 | const X86Subtarget &Subtarget) { |
33832 | EVT OrigVT = N->getValueType(0); |
33833 | SDValue Arg = isFNEG(N); |
33834 | assert(Arg.getNode() && "N is expected to be an FNEG node");
33835 | |
33836 | EVT VT = Arg.getValueType(); |
33837 | EVT SVT = VT.getScalarType(); |
33838 | SDLoc DL(N); |
33839 | |
33840 | // Let legalize expand this if it isn't a legal type yet. |
33841 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
33842 | return SDValue(); |
33843 | |
33844 | // If we're negating a FMUL node on a target with FMA, then we can avoid the |
33845 | // use of a constant by performing (-0 - A*B) instead. |
33846 | // FIXME: Check rounding control flags as well once it becomes available. |
33847 | if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && |
33848 | Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { |
33849 | SDValue Zero = DAG.getConstantFP(0.0, DL, VT); |
33850 | SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), |
33851 | Arg.getOperand(1), Zero); |
33852 | return DAG.getBitcast(OrigVT, NewNode); |
33853 | } |
33854 | |
33855 | // If we're negating an FMA node, then we can adjust the |
33856 | // instruction to include the extra negation. |
33857 | unsigned NewOpcode = 0; |
33858 | if (Arg.hasOneUse()) { |
33859 | switch (Arg.getOpcode()) { |
33860 | case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break; |
33861 | case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; |
33862 | case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; |
33863 | case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break; |
33864 | case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; |
33865 | case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; |
33866 | case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; |
33867 | case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; |
33868 | // We can't handle a scalar intrinsic node here because it would only
33869 | // invert one element and not the whole vector. But we could try to handle
33870 | // a negation of the lower element only.
33871 | } |
33872 | } |
33873 | if (NewOpcode) |
33874 | return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, |
33875 | Arg.getNode()->ops())); |
33876 | |
33877 | return SDValue(); |
33878 | } |
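      | // Illustrative examples of the opcode flips above (single-use inputs):
      | //   fneg (fmadd A, B, C) --> fnmsub A, B, C   ; -(A*B+C) == -(A*B)-C
      | //   fneg (fmsub A, B, C) --> fnmadd A, B, C   ; -(A*B-C) == -(A*B)+C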
33879 | |
33880 | static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, |
33881 | const X86Subtarget &Subtarget) { |
33882 | MVT VT = N->getSimpleValueType(0); |
33883 | // If we have integer vector types available, use the integer opcodes. |
33884 | if (VT.isVector() && Subtarget.hasSSE2()) { |
33885 | SDLoc dl(N); |
33886 | |
33887 | MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); |
33888 | |
33889 | SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); |
33890 | SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); |
33891 | unsigned IntOpcode; |
33892 | switch (N->getOpcode()) { |
33893 | default: llvm_unreachable("Unexpected FP logic op")::llvm::llvm_unreachable_internal("Unexpected FP logic op", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 33893); |
33894 | case X86ISD::FOR: IntOpcode = ISD::OR; break; |
33895 | case X86ISD::FXOR: IntOpcode = ISD::XOR; break; |
33896 | case X86ISD::FAND: IntOpcode = ISD::AND; break; |
33897 | case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; |
33898 | } |
33899 | SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); |
33900 | return DAG.getBitcast(VT, IntOp); |
33901 | } |
33902 | return SDValue(); |
33903 | } |
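      | // Illustrative sketch: e.g. an X86ISD::FOR of two v4f32 operands becomes
      | //   bitcast (or (bitcast Op0 to v2i64), (bitcast Op1 to v2i64)) to v4f32
      | // so the OR can participate in the usual integer-domain combines.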
33904 | |
33905 | static SDValue combineXor(SDNode *N, SelectionDAG &DAG, |
33906 | TargetLowering::DAGCombinerInfo &DCI, |
33907 | const X86Subtarget &Subtarget) { |
33908 | if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) |
33909 | return Cmp; |
33910 | |
33911 | if (DCI.isBeforeLegalizeOps()) |
33912 | return SDValue(); |
33913 | |
33914 | if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) |
33915 | return RV; |
33916 | |
33917 | if (Subtarget.hasCMov()) |
33918 | if (SDValue RV = combineIntegerAbs(N, DAG)) |
33919 | return RV; |
33920 | |
33921 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) |
33922 | return FPLogic; |
33923 | |
33924 | if (isFNEG(N)) |
33925 | return combineFneg(N, DAG, Subtarget); |
33926 | return SDValue(); |
33927 | } |
33928 | 
33930 | static bool isNullFPScalarOrVectorConst(SDValue V) { |
33931 | return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); |
33932 | } |
33933 | |
33934 | /// If a value is a scalar FP zero or a vector FP zero (potentially including |
33935 | /// undefined elements), return a zero constant that may be used to fold away |
33936 | /// that value. In the case of a vector, the returned constant will not contain |
33937 | /// undefined elements even if the input parameter does. This makes it suitable |
33938 | /// to be used as a replacement operand with operations (eg, bitwise-and) where |
33939 | /// an undef should not propagate. |
33940 | static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, |
33941 | const X86Subtarget &Subtarget) { |
33942 | if (!isNullFPScalarOrVectorConst(V)) |
33943 | return SDValue(); |
33944 | |
33945 | if (V.getValueType().isVector()) |
33946 | return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); |
33947 | |
33948 | return V; |
33949 | } |
33950 | |
33951 | static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, |
33952 | const X86Subtarget &Subtarget) { |
33953 | SDValue N0 = N->getOperand(0); |
33954 | SDValue N1 = N->getOperand(1); |
33955 | EVT VT = N->getValueType(0); |
33956 | SDLoc DL(N); |
33957 | |
33958 | // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). |
33959 | if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || |
33960 | (VT == MVT::f64 && Subtarget.hasSSE2()))) |
33961 | return SDValue(); |
33962 | |
33963 | auto isAllOnesConstantFP = [](SDValue V) { |
33964 | auto *C = dyn_cast<ConstantFPSDNode>(V); |
33965 | return C && C->getConstantFPValue()->isAllOnesValue(); |
33966 | }; |
33967 | |
33968 | // fand (fxor X, -1), Y --> fandn X, Y |
33969 | if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) |
33970 | return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); |
33971 | |
33972 | // fand X, (fxor Y, -1) --> fandn Y, X |
33973 | if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) |
33974 | return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); |
33975 | |
33976 | return SDValue(); |
33977 | } |
33978 | |
33979 | /// Do target-specific dag combines on X86ISD::FAND nodes. |
33980 | static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, |
33981 | const X86Subtarget &Subtarget) { |
33982 | // FAND(0.0, x) -> 0.0 |
33983 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) |
33984 | return V; |
33985 | |
33986 | // FAND(x, 0.0) -> 0.0 |
33987 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) |
33988 | return V; |
33989 | |
33990 | if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) |
33991 | return V; |
33992 | |
33993 | return lowerX86FPLogicOp(N, DAG, Subtarget); |
33994 | } |
33995 | |
33996 | /// Do target-specific dag combines on X86ISD::FANDN nodes. |
33997 | static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, |
33998 | const X86Subtarget &Subtarget) { |
33999 | // FANDN(0.0, x) -> x |
34000 | if (isNullFPScalarOrVectorConst(N->getOperand(0))) |
34001 | return N->getOperand(1); |
34002 | |
34003 | // FANDN(x, 0.0) -> 0.0 |
34004 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) |
34005 | return V; |
34006 | |
34007 | return lowerX86FPLogicOp(N, DAG, Subtarget); |
34008 | } |
34009 | |
34010 | /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. |
34011 | static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, |
34012 | const X86Subtarget &Subtarget) { |
34013 | assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34014 | |
34015 | // F[X]OR(0.0, x) -> x |
34016 | if (isNullFPScalarOrVectorConst(N->getOperand(0))) |
34017 | return N->getOperand(1); |
34018 | |
34019 | // F[X]OR(x, 0.0) -> x |
34020 | if (isNullFPScalarOrVectorConst(N->getOperand(1))) |
34021 | return N->getOperand(0); |
34022 | |
34023 | if (isFNEG(N)) |
34024 | if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) |
34025 | return NewVal; |
34026 | |
34027 | return lowerX86FPLogicOp(N, DAG, Subtarget); |
34028 | } |
34029 | |
34030 | /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. |
34031 | static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { |
34032 | assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34033 | |
34034 | // Only perform optimizations if UnsafeMath is used. |
34035 | if (!DAG.getTarget().Options.UnsafeFPMath) |
34036 | return SDValue(); |
34037 | |
34038 | // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34039 | // into FMAXC and FMINC, which are commutative operations.
34040 | unsigned NewOp = 0; |
34041 | switch (N->getOpcode()) { |
34042 | default: llvm_unreachable("unknown opcode")::llvm::llvm_unreachable_internal("unknown opcode", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Target/X86/X86ISelLowering.cpp" , 34042); |
34043 | case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; |
34044 | case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; |
34045 | } |
34046 | |
34047 | return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), |
34048 | N->getOperand(0), N->getOperand(1)); |
34049 | } |
34050 | |
34051 | static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, |
34052 | const X86Subtarget &Subtarget) { |
34053 | if (Subtarget.useSoftFloat()) |
34054 | return SDValue(); |
34055 | |
34056 | // TODO: Check for global or instruction-level "nnan". In that case, we |
34057 | // should be able to lower to FMAX/FMIN alone. |
34058 | // TODO: If an operand is already known to be a NaN or not a NaN, this |
34059 | // should be an optional swap and FMAX/FMIN. |
34060 | |
34061 | EVT VT = N->getValueType(0); |
34062 | if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || |
34063 | (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || |
34064 | (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) |
34065 | return SDValue(); |
34066 | |
34067 | // This takes at least 3 instructions, so favor a library call when operating |
34068 | // on a scalar and minimizing code size. |
34069 | if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) |
34070 | return SDValue(); |
34071 | |
34072 | SDValue Op0 = N->getOperand(0); |
34073 | SDValue Op1 = N->getOperand(1); |
34074 | SDLoc DL(N); |
34075 | EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( |
34076 | DAG.getDataLayout(), *DAG.getContext(), VT); |
34077 | |
34078 | // There are 4 possibilities involving NaN inputs, and these are the required |
34079 | // outputs: |
34080 | // Op1 |
34081 | // Num NaN |
34082 | // ---------------- |
34083 | // Num | Max | Op0 | |
34084 | // Op0 ---------------- |
34085 | // NaN | Op1 | NaN | |
34086 | // ---------------- |
34087 | // |
34088 | // The SSE FP max/min instructions were not designed for this case, but rather |
34089 | // to implement: |
34090 | // Min = Op1 < Op0 ? Op1 : Op0 |
34091 | // Max = Op1 > Op0 ? Op1 : Op0 |
34092 | // |
34093 | // So they always return Op0 if either input is a NaN. However, we can still |
34094 | // use those instructions for fmaxnum by selecting away a NaN input. |
34095 | |
34096 | // If either operand is NaN, the 2nd source operand (Op0) is passed through. |
34097 | auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; |
34098 | SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); |
34099 | SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
34100 | |
34101 | // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands |
34102 | // are NaN, the NaN value of Op1 is the result. |
34103 | return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); |
34104 | } |
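      | // Illustrative scalar reading of the nodes built above, for fmaxnum:
      | //   MinOrMax = FMAX(Op1, Op0)      ; yields Op0 if either input is NaN
      | //   IsOp0Nan = setcc Op0, Op0, UO  ; true iff Op0 is NaN
      | //   result   = IsOp0Nan ? Op1 : MinOrMax
      | // If only Op1 is NaN, FMAX already produced Op0; if Op0 is NaN, the
      | // select picks Op1; if both are NaN, Op1's NaN is returned, matching the
      | // table above.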
34105 | |
34106 | /// Do target-specific dag combines on X86ISD::ANDNP nodes. |
34107 | static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, |
34108 | TargetLowering::DAGCombinerInfo &DCI, |
34109 | const X86Subtarget &Subtarget) { |
34110 | // ANDNP(0, x) -> x |
34111 | if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) |
34112 | return N->getOperand(1); |
34113 | |
34114 | // ANDNP(x, 0) -> 0 |
34115 | if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) |
34116 | return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); |
34117 | |
34118 | EVT VT = N->getValueType(0); |
34119 | |
34120 | // Attempt to recursively combine a bitmask ANDNP with shuffles. |
34121 | if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { |
34122 | SDValue Op(N, 0); |
34123 | SmallVector<int, 1> NonceMask; // Just a placeholder. |
34124 | NonceMask.push_back(0); |
34125 | if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, |
34126 | /*Depth*/ 1, /*HasVarMask*/ false, DAG, |
34127 | DCI, Subtarget)) |
34128 | return SDValue(); // This routine will use CombineTo to replace N. |
34129 | } |
34130 | |
34131 | return SDValue(); |
34132 | } |
34133 | |
34134 | static SDValue combineBT(SDNode *N, SelectionDAG &DAG, |
34135 | TargetLowering::DAGCombinerInfo &DCI) { |
34136 | // BT ignores high bits in the bit index operand. |
34137 | SDValue Op1 = N->getOperand(1); |
34138 | if (Op1.hasOneUse()) { |
34139 | unsigned BitWidth = Op1.getValueSizeInBits(); |
34140 | APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); |
34141 | KnownBits Known; |
34142 | TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), |
34143 | !DCI.isBeforeLegalizeOps()); |
34144 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
34145 | if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || |
34146 | TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) |
34147 | DCI.CommitTargetLoweringOpt(TLO); |
34148 | } |
34149 | return SDValue(); |
34150 | } |
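      | // Illustrative consequence (hypothetical input): for a 32-bit BT only
      | // the low 5 bits of the index are demanded, so a redundant mask such as
      | //   bt X, (and Idx, 31)
      | // can have the AND simplified away.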
34151 | |
34152 | static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, |
34153 | const X86Subtarget &Subtarget) { |
34154 | EVT VT = N->getValueType(0); |
34155 | if (!VT.isVector()) |
34156 | return SDValue(); |
34157 | |
34158 | SDValue N0 = N->getOperand(0); |
34159 | SDValue N1 = N->getOperand(1); |
34160 | EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); |
34161 | SDLoc dl(N); |
34162 | |
34163 | // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
34164 | // AVX2 since there is no sign-extended shift-right operation on a vector
34165 | // with 64-bit elements.
34166 | // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
34167 | // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
34168 | if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || |
34169 | N0.getOpcode() == ISD::SIGN_EXTEND)) { |
34170 | SDValue N00 = N0.getOperand(0); |
34171 | |
34172 | // EXTLOAD has a better solution on AVX2, |
34173 | // it may be replaced with X86ISD::VSEXT node. |
34174 | if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) |
34175 | if (!ISD::isNormalLoad(N00.getNode())) |
34176 | return SDValue(); |
34177 | |
34178 | if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { |
34179 | SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, |
34180 | N00, N1); |
34181 | return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); |
34182 | } |
34183 | } |
34184 | return SDValue(); |
34185 | } |
34186 | |
34187 | /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) |
34188 | /// zext(add_nuw(x, C)) --> add(zext(x), C_zext) |
34189 | /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes |
34190 | /// opportunities to combine math ops, use an LEA, or use a complex addressing |
34191 | /// mode. This can eliminate extend, add, and shift instructions. |
34192 | static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, |
34193 | const X86Subtarget &Subtarget) { |
34194 | if (Ext->getOpcode() != ISD::SIGN_EXTEND && |
34195 | Ext->getOpcode() != ISD::ZERO_EXTEND) |
34196 | return SDValue(); |
34197 | |
34198 | // TODO: This should be valid for other integer types. |
34199 | EVT VT = Ext->getValueType(0); |
34200 | if (VT != MVT::i64) |
34201 | return SDValue(); |
34202 | |
34203 | SDValue Add = Ext->getOperand(0); |
34204 | if (Add.getOpcode() != ISD::ADD) |
34205 | return SDValue(); |
34206 | |
34207 | bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; |
34208 | bool NSW = Add->getFlags().hasNoSignedWrap(); |
34209 | bool NUW = Add->getFlags().hasNoUnsignedWrap(); |
34210 | |
34211 | // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding
34212 | // into the 'zext'.
34213 | if ((Sext && !NSW) || (!Sext && !NUW)) |
34214 | return SDValue(); |
34215 | |
34216 | // Having a constant operand to the 'add' ensures that we are not increasing |
34217 | // the instruction count because the constant is extended for free below. |
34218 | // A constant operand can also become the displacement field of an LEA. |
34219 | auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); |
34220 | if (!AddOp1) |
34221 | return SDValue(); |
34222 | |
34223 | // Don't make the 'add' bigger if there's no hope of combining it with some |
34224 | // other 'add' or 'shl' instruction. |
34225 | // TODO: It may be profitable to generate simpler LEA instructions in place |
34226 | // of single 'add' instructions, but the cost model for selecting an LEA |
34227 | // currently has a high threshold. |
34228 | bool HasLEAPotential = false; |
34229 | for (auto *User : Ext->uses()) { |
34230 | if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { |
34231 | HasLEAPotential = true; |
34232 | break; |
34233 | } |
34234 | } |
34235 | if (!HasLEAPotential) |
34236 | return SDValue(); |
34237 | |
34238 | // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. |
34239 | int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); |
34240 | SDValue AddOp0 = Add.getOperand(0); |
34241 | SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); |
34242 | SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); |
34243 | |
34244 | // The wider add is guaranteed to not wrap because both operands are |
34245 | // sign-extended. |
34246 | SDNodeFlags Flags; |
34247 | Flags.setNoSignedWrap(NSW); |
34248 | Flags.setNoUnsignedWrap(NUW); |
34249 | return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); |
34250 | } |
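      | // Illustrative sketch (hypothetical constant): for an index computation
      | //   sext i32 (add nsw X, 20) to i64
      | //     --> add nsw (sext X to i64), 20
      | // the widened constant is free, and when the result feeds another add
      | // or shl it can fold into an LEA or a complex addressing mode.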
34251 | |
34252 | /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
34253 | /// (i8,i32 ({s/u}divrem8_{s/z}ext_hreg (i8 x, i8 y)))
34254 | /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly |
34255 | /// extends from AH (which we otherwise need to do contortions to access). |
34256 | static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { |
34257 | SDValue N0 = N->getOperand(0); |
34258 | auto OpcodeN = N->getOpcode(); |
34259 | auto OpcodeN0 = N0.getOpcode(); |
34260 | if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || |
34261 | (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) |
34262 | return SDValue(); |
34263 | |
34264 | EVT VT = N->getValueType(0); |
34265 | EVT InVT = N0.getValueType(); |
34266 | if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) |
34267 | return SDValue(); |
34268 | |
34269 | SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); |
34270 | auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG |
34271 | : X86ISD::UDIVREM8_ZEXT_HREG; |
34272 | SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), |
34273 | N0.getOperand(1)); |
34274 | DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); |
34275 | return R.getValue(1); |
34276 | } |
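      | // Illustrative sketch: for
      | //   Rem = (sdivrem i8 X, Y).1   ; the remainder result
      | //   R   = sext Rem to i32
      | // this emits X86ISD::SDIVREM8_SEXT_HREG, whose second result is the
      | // remainder already sign-extended out of AH.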
34277 | |
34278 | /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34279 | /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
34280 | /// with UNDEFs) the input into vectors of the same size as the target type,
34281 | /// which then extend the lowest elements.
34282 | static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, |
34283 | TargetLowering::DAGCombinerInfo &DCI, |
34284 | const X86Subtarget &Subtarget) { |
34285 | unsigned Opcode = N->getOpcode(); |
34286 | if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) |
34287 | return SDValue(); |
34288 | if (!DCI.isBeforeLegalizeOps()) |
34289 | return SDValue(); |
34290 | if (!Subtarget.hasSSE2()) |
34291 | return SDValue(); |
34292 | |
34293 | SDValue N0 = N->getOperand(0); |
34294 | EVT VT = N->getValueType(0); |
34295 | EVT SVT = VT.getScalarType(); |
34296 | EVT InVT = N0.getValueType(); |
34297 | EVT InSVT = InVT.getScalarType(); |
34298 | |
34299 | // Input type must be a vector and we must be extending legal integer types. |
34300 | if (!VT.isVector()) |
34301 | return SDValue(); |
34302 | if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) |
34303 | return SDValue(); |
34304 | if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) |
34305 | return SDValue(); |
34306 | |
34307 | // On AVX2+ targets, if the input/output types are both legal then we will be |
34308 | // able to use SIGN_EXTEND/ZERO_EXTEND directly. |
34309 | if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
34310 | DAG.getTargetLoweringInfo().isTypeLegal(InVT)) |
34311 | return SDValue(); |
34312 | |
34313 | SDLoc DL(N); |
34314 | |
34315 | auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { |
34316 | EVT InVT = N.getValueType(); |
34317 | EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), |
34318 | Size / InVT.getScalarSizeInBits()); |
34319 | SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), |
34320 | DAG.getUNDEF(InVT)); |
34321 | Opnds[0] = N; |
34322 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); |
34323 | }; |
34324 | |
34325 | // If target-size is less than 128-bits, extend to a type that would extend |
34326 | // to 128 bits, extend that and extract the original target vector. |
34327 | if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { |
34328 | unsigned Scale = 128 / VT.getSizeInBits(); |
34329 | EVT ExVT = |
34330 | EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); |
34331 | SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); |
34332 | SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); |
34333 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, |
34334 | DAG.getIntPtrConstant(0, DL)); |
34335 | } |
34336 | |
34337 | // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to |
34338 | // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. |
34339 | // Also use this if we don't have SSE41 to allow the legalizer to do its job.
34340 | if (!Subtarget.hasSSE41() || VT.is128BitVector() || |
34341 | (VT.is256BitVector() && Subtarget.hasInt256()) || |
34342 | (VT.is512BitVector() && Subtarget.hasAVX512())) { |
34343 | SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); |
34344 | return Opcode == ISD::SIGN_EXTEND |
34345 | ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) |
34346 | : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); |
34347 | } |
34348 | |
34349 | auto SplitAndExtendInReg = [&](unsigned SplitSize) { |
34350 | unsigned NumVecs = VT.getSizeInBits() / SplitSize; |
34351 | unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); |
34352 | EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); |
34353 | EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); |
34354 | |
34355 | SmallVector<SDValue, 8> Opnds; |
34356 | for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { |
34357 | SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, |
34358 | DAG.getIntPtrConstant(Offset, DL)); |
34359 | SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); |
34360 | SrcVec = Opcode == ISD::SIGN_EXTEND |
34361 | ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) |
34362 | : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); |
34363 | Opnds.push_back(SrcVec); |
34364 | } |
34365 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); |
34366 | }; |
34367 | |
34368 | // On pre-AVX2 targets, split into 128-bit nodes of |
34369 | // ISD::*_EXTEND_VECTOR_INREG. |
34370 | if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) |
34371 | return SplitAndExtendInReg(128); |
34372 | |
34373 | // On pre-AVX512 targets, split into 256-bit nodes of |
34374 | // ISD::*_EXTEND_VECTOR_INREG. |
34375 | if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) |
34376 | return SplitAndExtendInReg(256); |
34377 | |
34378 | return SDValue(); |
34379 | } |
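      | // Illustrative sketch: on SSE4.1 without AVX2, a v8i16 -> v8i32 zext is
      | // split into two ZERO_EXTEND_VECTOR_INREG nodes (pmovzxwd), one per
      | // v4i16 half, whose v4i32 results are concatenated into the v8i32.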
34380 | |
34381 | static SDValue combineSext(SDNode *N, SelectionDAG &DAG, |
34382 | TargetLowering::DAGCombinerInfo &DCI, |
34383 | const X86Subtarget &Subtarget) { |
34384 | SDValue N0 = N->getOperand(0); |
34385 | EVT VT = N->getValueType(0); |
34386 | EVT InVT = N0.getValueType(); |
34387 | SDLoc DL(N); |
34388 | |
34389 | if (SDValue DivRem8 = getDivRem8(N, DAG)) |
34390 | return DivRem8; |
34391 | |
34392 | if (!DCI.isBeforeLegalizeOps()) { |
34393 | if (InVT == MVT::i1) { |
34394 | SDValue Zero = DAG.getConstant(0, DL, VT); |
34395 | SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); |
34396 | return DAG.getSelect(DL, VT, N0, AllOnes, Zero); |
34397 | } |
34398 | return SDValue(); |
34399 | } |
34400 | |
34401 | if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && |
34402 | isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { |
34403 | // Inverting and sign-extending a boolean is the same as zero-extending and
34404 | // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
34405 | // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34406 | // sext (xor Bool, -1) --> sub (zext Bool), 1 |
34407 | SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); |
34408 | return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); |
34409 | } |
34410 | |
34411 | if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) |
34412 | return V; |
34413 | |
34414 | if (Subtarget.hasAVX() && VT.is256BitVector()) |
34415 | if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) |
34416 | return R; |
34417 | |
34418 | if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) |
34419 | return NewAdd; |
34420 | |
34421 | return SDValue(); |
34422 | } |
34423 | |
34424 | static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, |
34425 | const X86Subtarget &Subtarget) { |
34426 | SDLoc dl(N); |
34427 | EVT VT = N->getValueType(0); |
34428 | |
34429 | // Let legalize expand this if it isn't a legal type yet. |
34430 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
34431 | return SDValue(); |
34432 | |
34433 | EVT ScalarVT = VT.getScalarType(); |
34434 | if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) |
34435 | return SDValue(); |
34436 | |
34437 | SDValue A = N->getOperand(0); |
34438 | SDValue B = N->getOperand(1); |
34439 | SDValue C = N->getOperand(2); |
34440 | |
34441 | auto invertIfNegative = [](SDValue &V) { |
34442 | if (SDValue NegVal = isFNEG(V.getNode())) { |
34443 | V = NegVal; |
34444 | return true; |
34445 | } |
34446 | return false; |
34447 | }; |
34448 | |
34449 | // Do not convert the passthru input of scalar intrinsics. |
34450 | // FIXME: We could allow negations of the lower element only. |
34451 | bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); |
34452 | bool NegB = invertIfNegative(B); |
34453 | bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); |
34454 | |
34455 | // The multiplication is negated when NegA xor NegB.
34456 | bool NegMul = (NegA != NegB); |
34457 | |
34458 | unsigned NewOpcode; |
34459 | if (!NegMul) |
34460 | NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; |
34461 | else |
34462 | NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; |
34463 | 
34465 | if (N->getOpcode() == X86ISD::FMADD_RND) { |
34466 | switch (NewOpcode) { |
34467 | case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break; |
34468 | case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; |
34469 | case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; |
34470 | case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; |
34471 | } |
34472 | } else if (N->getOpcode() == X86ISD::FMADDS1_RND) { |
34473 | switch (NewOpcode) { |
34474 | case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break; |
34475 | case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break; |
34476 | case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break; |
34477 | case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break; |
34478 | } |
34479 | } else if (N->getOpcode() == X86ISD::FMADDS3_RND) { |
34480 | switch (NewOpcode) { |
34481 | case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break; |
34482 | case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break; |
34483 | case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break; |
34484 | case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break; |
34485 | } |
34486 | } else { |
34487 | assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34488 | "Unexpected opcode!");
34489 | return DAG.getNode(NewOpcode, dl, VT, A, B, C); |
34490 | } |
34491 | |
34492 | return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); |
34493 | } |
34494 | |
34495 | static SDValue combineZext(SDNode *N, SelectionDAG &DAG, |
34496 | TargetLowering::DAGCombinerInfo &DCI, |
34497 | const X86Subtarget &Subtarget) { |
34498 | // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> |
34499 | // (and (i32 x86isd::setcc_carry), 1) |
34500 | // This eliminates the zext. This transformation is necessary because |
34501 | // ISD::SETCC is always legalized to i8. |
34502 | SDLoc dl(N); |
34503 | SDValue N0 = N->getOperand(0); |
34504 | EVT VT = N->getValueType(0); |
34505 | |
34506 | if (N0.getOpcode() == ISD::AND && |
34507 | N0.hasOneUse() && |
34508 | N0.getOperand(0).hasOneUse()) { |
34509 | SDValue N00 = N0.getOperand(0); |
34510 | if (N00.getOpcode() == X86ISD::SETCC_CARRY) { |
34511 | if (!isOneConstant(N0.getOperand(1))) |
34512 | return SDValue(); |
34513 | return DAG.getNode(ISD::AND, dl, VT, |
34514 | DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, |
34515 | N00.getOperand(0), N00.getOperand(1)), |
34516 | DAG.getConstant(1, dl, VT)); |
34517 | } |
34518 | } |
34519 | |
34520 | if (N0.getOpcode() == ISD::TRUNCATE && |
34521 | N0.hasOneUse() && |
34522 | N0.getOperand(0).hasOneUse()) { |
34523 | SDValue N00 = N0.getOperand(0); |
34524 | if (N00.getOpcode() == X86ISD::SETCC_CARRY) { |
34525 | return DAG.getNode(ISD::AND, dl, VT, |
34526 | DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, |
34527 | N00.getOperand(0), N00.getOperand(1)), |
34528 | DAG.getConstant(1, dl, VT)); |
34529 | } |
34530 | } |
34531 | |
34532 | if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) |
34533 | return V; |
34534 | |
34535 | if (VT.is256BitVector()) |
34536 | if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) |
34537 | return R; |
34538 | |
34539 | if (SDValue DivRem8 = getDivRem8(N, DAG)) |
34540 | return DivRem8; |
34541 | |
34542 | if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) |
34543 | return NewAdd; |
34544 | |
34545 | if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) |
34546 | return R; |
34547 | |
34548 | return SDValue(); |
34549 | } |
34550 | |
34551 | /// Try to map a 128-bit or larger integer comparison to vector instructions |
34552 | /// before type legalization splits it up into chunks. |
34553 | static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, |
34554 | const X86Subtarget &Subtarget) { |
34555 | ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); |
34556 | assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34557 | |
34558 | // We're looking for an oversized integer equality comparison, but ignore a |
34559 | // comparison with zero because that gets special treatment in EmitTest(). |
34560 | SDValue X = SetCC->getOperand(0); |
34561 | SDValue Y = SetCC->getOperand(1); |
34562 | EVT OpVT = X.getValueType(); |
34563 | unsigned OpSize = OpVT.getSizeInBits(); |
34564 | if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) |
34565 | return SDValue(); |
34566 | |
34567 | // TODO: Use PXOR + PTEST for SSE4.1 or later? |
34568 | // TODO: Add support for AVX-512. |
34569 | EVT VT = SetCC->getValueType(0); |
34570 | SDLoc DL(SetCC); |
34571 | if ((OpSize == 128 && Subtarget.hasSSE2()) || |
34572 | (OpSize == 256 && Subtarget.hasAVX2())) { |
34573 | EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; |
34574 | SDValue VecX = DAG.getBitcast(VecVT, X); |
34575 | SDValue VecY = DAG.getBitcast(VecVT, Y); |
34576 | |
34577 | // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. |
34578 | // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq |
34579 | // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne |
34580 | // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq |
34581 | // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne |
34582 | SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); |
34583 | SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); |
34584 | SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, |
34585 | MVT::i32); |
34586 | return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); |
34587 | } |
34588 | |
34589 | return SDValue(); |
34590 | } |
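      | // Illustrative sketch: a 16-byte equality test such as
      | //   setcc i128 X, Y, eq
      | // becomes pcmpeqb + pmovmskb + a compare against 0xFFFF, instead of the
      | // chain of 64-bit compares that type legalization would otherwise
      | // produce.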
34591 | |
34592 | static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, |
34593 | const X86Subtarget &Subtarget) { |
34594 | ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); |
34595 | SDValue LHS = N->getOperand(0); |
34596 | SDValue RHS = N->getOperand(1); |
34597 | EVT VT = N->getValueType(0); |
34598 | SDLoc DL(N); |
34599 | |
34600 | if (CC == ISD::SETNE || CC == ISD::SETEQ) { |
34601 | EVT OpVT = LHS.getValueType(); |
34602 | // 0-x == y --> x+y == 0 |
34603 | // 0-x != y --> x+y != 0 |
34604 | if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && |
34605 | LHS.hasOneUse()) { |
34606 | SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); |
34607 | return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); |
34608 | } |
34609 | // x == 0-y --> x+y == 0 |
34610 | // x != 0-y --> x+y != 0 |
34611 | if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && |
34612 | RHS.hasOneUse()) { |
34613 | SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); |
34614 | return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); |
34615 | } |
34616 | |
34617 | if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) |
34618 | return V; |
34619 | } |
34620 | |
34621 | if (VT.getScalarType() == MVT::i1 && |
34622 | (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { |
34623 | bool IsSEXT0 = |
34624 | (LHS.getOpcode() == ISD::SIGN_EXTEND) && |
34625 | (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); |
34626 | bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); |
34627 | |
34628 | if (!IsSEXT0 || !IsVZero1) { |
34629 | // Swap the operands and update the condition code. |
34630 | std::swap(LHS, RHS); |
34631 | CC = ISD::getSetCCSwappedOperands(CC); |
34632 | |
34633 | IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && |
34634 | (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); |
34635 | IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); |
34636 | } |
34637 | |
34638 | if (IsSEXT0 && IsVZero1) { |
34639 | assert(VT == LHS.getOperand(0).getValueType() &&
34640 | "Unexpected operand type");
34641 | if (CC == ISD::SETGT) |
34642 | return DAG.getConstant(0, DL, VT); |
34643 | if (CC == ISD::SETLE) |
34644 | return DAG.getConstant(1, DL, VT); |
34645 | if (CC == ISD::SETEQ || CC == ISD::SETGE) |
34646 | return DAG.getNOT(DL, LHS.getOperand(0), VT); |
34647 | |
34648 | assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34649 | "Unexpected condition code!");
34650 | return LHS.getOperand(0); |
34651 | } |
34652 | } |
34653 | |
34654 | // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early |
34655 | // to avoid scalarization via legalization because v4i32 is not a legal type. |
34656 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 && |
34657 | LHS.getValueType() == MVT::v4f32) |
34658 | return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); |
34659 | |
34660 | return SDValue(); |
34661 | } |
34662 | |
34663 | static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { |
34664 | SDLoc DL(N); |
34665 | // Gather and Scatter instructions use k-registers for masks. The type of |
34666 | // the masks is v*i1. So the mask will be truncated anyway. |
34667 | // The SIGN_EXTEND_INREG may be dropped.
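      | // Illustrative: for (mgather ..., (sign_extend_inreg M), ...), only bit 0
      | // of each mask element is consumed, so M can be used directly.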
34668 | SDValue Mask = N->getOperand(2); |
34669 | if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { |
34670 | SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); |
34671 | NewOps[2] = Mask.getOperand(0); |
34672 | DAG.UpdateNodeOperands(N, NewOps); |
34673 | } |
34674 | return SDValue(); |
34675 | } |
34676 | |
34677 | // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT |
34678 | static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, |
34679 | const X86Subtarget &Subtarget) { |
34680 | SDLoc DL(N); |
34681 | X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); |
34682 | SDValue EFLAGS = N->getOperand(1); |
34683 | |
34684 | // Try to simplify the EFLAGS and condition code operands. |
34685 | if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) |
34686 | return getSETCC(CC, Flags, DL, DAG); |
34687 | |
34688 | return SDValue(); |
34689 | } |
34690 | |
34691 | /// Optimize branch condition evaluation. |
34692 | static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, |
34693 | const X86Subtarget &Subtarget) { |
34694 | SDLoc DL(N); |
34695 | SDValue EFLAGS = N->getOperand(3); |
34696 | X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); |
34697 | |
34698 | // Try to simplify the EFLAGS and condition code operands. |
34699 | // Make sure to not keep references to operands, as combineSetCCEFLAGS can |
34700 | // RAUW them under us. |
34701 | if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { |
34702 | SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); |
34703 | return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), |
34704 | N->getOperand(1), Cond, Flags); |
34705 | } |
34706 | |
34707 | return SDValue(); |
34708 | } |
34709 | |
34710 | static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, |
34711 | SelectionDAG &DAG) { |
34712 | // Take advantage of vector comparisons producing 0 or -1 in each lane to |
34713 | // optimize away operation when it's from a constant. |
34714 | // |
34715 | // The general transformation is: |
34716 | // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> |
34717 | // AND(VECTOR_CMP(x,y), constant2) |
34718 | // constant2 = UNARYOP(constant) |
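      | //
      | // Illustrative: SINT_TO_FP (AND (SETCC x, y), <1,1,1,1>) becomes
      | // (bitcast (AND (SETCC x, y), (bitcast <1.0,1.0,1.0,1.0>))), since each
      | // lane of the compare result is all-zeros or all-ones.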
34719 | |
34720 | // Early exit if this isn't a vector operation, the operand of the |
34721 | // unary operation isn't a bitwise AND, or if the sizes of the operations |
34722 | // aren't the same. |
34723 | EVT VT = N->getValueType(0); |
34724 | if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || |
34725 | N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || |
34726 | VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) |
34727 | return SDValue(); |
34728 | |
34729 | // Now check that the other operand of the AND is a constant. We could |
34730 | // make the transformation for non-constant splats as well, but it's unclear |
34731 | // that would be a benefit as it would not eliminate any operations, just |
34732 | // perform one more step in scalar code before moving to the vector unit. |
34733 | if (BuildVectorSDNode *BV = |
34734 | dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { |
34735 | // Bail out if the vector isn't a constant. |
34736 | if (!BV->isConstant()) |
34737 | return SDValue(); |
34738 | |
34739 | // Everything checks out. Build up the new and improved node. |
34740 | SDLoc DL(N); |
34741 | EVT IntVT = BV->getValueType(0); |
34742 | // Create a new constant of the appropriate type for the transformed |
34743 | // DAG. |
34744 | SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); |
34745 | // The AND node needs bitcasts to/from an integer vector type around it. |
34746 | SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); |
34747 | SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, |
34748 | N->getOperand(0)->getOperand(0), MaskConst); |
34749 | SDValue Res = DAG.getBitcast(VT, NewAnd); |
34750 | return Res; |
34751 | } |
34752 | |
34753 | return SDValue(); |
34754 | } |
34755 | |
34756 | static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, |
34757 | const X86Subtarget &Subtarget) { |
34758 | SDValue Op0 = N->getOperand(0); |
34759 | EVT VT = N->getValueType(0); |
34760 | EVT InVT = Op0.getValueType(); |
34761 | EVT InSVT = InVT.getScalarType(); |
34762 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
34763 | |
34764 | // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) |
34765 | // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) |
34766 | if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { |
34767 | SDLoc dl(N); |
34768 | EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, |
34769 | InVT.getVectorNumElements()); |
34770 | SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); |
34771 | |
34772 | if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) |
34773 | return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); |
34774 | |
34775 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); |
34776 | } |
34777 | |
34778 | // Since UINT_TO_FP is legal (it's marked custom), the generic DAG combiner
34779 | // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
34780 | // perform the optimization here.
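      | // (When the sign bit is zero, the signed and unsigned interpretations
      | // coincide, e.g. for a value produced by a zext from a narrower type.)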
34781 | if (DAG.SignBitIsZero(Op0)) |
34782 | return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); |
34783 | |
34784 | return SDValue(); |
34785 | } |
34786 | |
34787 | static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, |
34788 | const X86Subtarget &Subtarget) { |
34789 | // First try to optimize away the conversion entirely when it's |
34790 | // conditionally from a constant. Vectors only. |
34791 | if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) |
34792 | return Res; |
34793 | |
34794 | // Now move on to more general possibilities. |
34795 | SDValue Op0 = N->getOperand(0); |
34796 | EVT VT = N->getValueType(0); |
34797 | EVT InVT = Op0.getValueType(); |
34798 | EVT InSVT = InVT.getScalarType(); |
34799 | |
34800 | // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) |
34801 | // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) |
34802 | // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) |
34803 | if (InVT.isVector() && |
34804 | (InSVT == MVT::i8 || InSVT == MVT::i16 || |
34805 | (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) { |
34806 | SDLoc dl(N); |
34807 | EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, |
34808 | InVT.getVectorNumElements()); |
34809 | SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); |
34810 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); |
34811 | } |
34812 | |
34813 | // Without AVX512DQ we only support i64 to float scalar conversion. For both |
34814 | // vectors and scalars, see if we know that the upper bits are all the sign |
34815 | // bit, in which case we can truncate the input to i32 and convert from that. |
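      | // For example, an i64 input with at least 33 sign bits (64 - 31) is
      | // losslessly representable as a signed i32.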
34816 | if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { |
34817 | unsigned BitWidth = InVT.getScalarSizeInBits(); |
34818 | unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); |
34819 | if (NumSignBits >= (BitWidth - 31)) { |
34820 | EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); |
34821 | if (InVT.isVector()) |
34822 | TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, |
34823 | InVT.getVectorNumElements()); |
34824 | SDLoc dl(N); |
34825 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); |
34826 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); |
34827 | } |
34828 | } |
34829 | |
34830 | // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have |
34831 | // a 32-bit target where SSE doesn't support i64->FP operations. |
34832 | if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { |
34833 | LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); |
34834 | EVT LdVT = Ld->getValueType(0); |
34835 | |
34836 | // This transformation is not supported if the result type is f16 or f128. |
34837 | if (VT == MVT::f16 || VT == MVT::f128) |
34838 | return SDValue(); |
34839 | |
34840 | if (!Ld->isVolatile() && !VT.isVector() && |
34841 | ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && |
34842 | !Subtarget.is64Bit() && LdVT == MVT::i64) { |
34843 | SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( |
34844 | SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); |
34845 | DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); |
34846 | return FILDChain; |
34847 | } |
34848 | } |
34849 | return SDValue(); |
34850 | } |
34851 | |
34852 | // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS |
34853 | static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG, |
34854 | X86TargetLowering::DAGCombinerInfo &DCI) { |
34855 | // When legalizing carry, we create carries via add X, -1 |
34856 | // If that comes from an actual carry, via setcc, we use the |
34857 | // carry directly. |
34858 | if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) { |
34859 | SDValue Carry = N->getOperand(0); |
34860 | while (Carry.getOpcode() == ISD::TRUNCATE || |
34861 | Carry.getOpcode() == ISD::ZERO_EXTEND || |
34862 | Carry.getOpcode() == ISD::SIGN_EXTEND || |
34863 | Carry.getOpcode() == ISD::ANY_EXTEND || |
34864 | (Carry.getOpcode() == ISD::AND && |
34865 | isOneConstant(Carry.getOperand(1)))) |
34866 | Carry = Carry.getOperand(0); |
34867 | |
34868 | if (Carry.getOpcode() == X86ISD::SETCC || |
34869 | Carry.getOpcode() == X86ISD::SETCC_CARRY) { |
34870 | if (Carry.getConstantOperandVal(0) == X86::COND_B) |
34871 | return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1)); |
34872 | } |
34873 | } |
34874 | |
34875 | return SDValue(); |
34876 | } |
34877 | |
34878 | // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS |
34879 | static SDValue combineADC(SDNode *N, SelectionDAG &DAG, |
34880 | X86TargetLowering::DAGCombinerInfo &DCI) { |
34881 | // If the LHS and RHS of the ADC node are zero, then it can't overflow and |
34882 | // the result is either zero or one (depending on the input carry bit). |
34883 | // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. |
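      | // Illustrative: (adc 0, 0, EFLAGS) computes 0 + 0 + CF, i.e. exactly the
      | // incoming carry bit.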
34884 | if (X86::isZeroNode(N->getOperand(0)) && |
34885 | X86::isZeroNode(N->getOperand(1)) && |
34886 | // We don't have a good way to replace an EFLAGS use, so only do this when |
34887 | // dead right now. |
34888 | SDValue(N, 1).use_empty()) { |
34889 | SDLoc DL(N); |
34890 | EVT VT = N->getValueType(0); |
34891 | SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); |
34892 | SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, |
34893 | DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, |
34894 | DAG.getConstant(X86::COND_B, DL, |
34895 | MVT::i8), |
34896 | N->getOperand(2)), |
34897 | DAG.getConstant(1, DL, VT)); |
34898 | return DCI.CombineTo(N, Res1, CarryOut); |
34899 | } |
34900 | |
34901 | return SDValue(); |
34902 | } |
34903 | |
34904 | /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit |
34905 | /// which is more useful than 0/1 in some cases. |
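      | /// For example, with CF set, "sbb %eax, %eax" produces all-ones (-1);
      | /// with CF clear it produces 0, without materializing any constant.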
34906 | static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { |
34907 | SDLoc DL(N); |
34908 | // "Condition code B" is also known as "the carry flag" (CF). |
34909 | SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); |
34910 | SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); |
34911 | MVT VT = N->getSimpleValueType(0); |
34912 | if (VT == MVT::i8) |
34913 | return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); |
34914 | |
34915 | assert(VT == MVT::i1 && "Unexpected type for SETCC node");
34916 | return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); |
34917 | } |
34918 | |
34919 | /// If this is an add or subtract where one operand is produced by a cmp+setcc, |
34920 | /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} |
34921 | /// with CMP+{ADC, SBB}. |
34922 | static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { |
34923 | bool IsSub = N->getOpcode() == ISD::SUB; |
34924 | SDValue X = N->getOperand(0); |
34925 | SDValue Y = N->getOperand(1); |
34926 | |
34927 | // If this is an add, canonicalize a zext operand to the RHS. |
34928 | // TODO: Incomplete? What if both sides are zexts? |
34929 | if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && |
34930 | Y.getOpcode() != ISD::ZERO_EXTEND) |
34931 | std::swap(X, Y); |
34932 | |
34933 | // Look through a one-use zext. |
34934 | bool PeekedThroughZext = false; |
34935 | if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { |
34936 | Y = Y.getOperand(0); |
34937 | PeekedThroughZext = true; |
34938 | } |
34939 | |
34940 | // If this is an add, canonicalize a setcc operand to the RHS. |
34941 | // TODO: Incomplete? What if both sides are setcc? |
34942 | // TODO: Should we allow peeking through a zext of the other operand? |
34943 | if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && |
34944 | Y.getOpcode() != X86ISD::SETCC) |
34945 | std::swap(X, Y); |
34946 | |
34947 | if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) |
34948 | return SDValue(); |
34949 | |
34950 | SDLoc DL(N); |
34951 | EVT VT = N->getValueType(0); |
34952 | X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); |
34953 | |
34954 | if (CC == X86::COND_B) { |
34955 | // X + SETB Z --> X + (mask SBB Z, Z) |
34956 | // X - SETB Z --> X - (mask SBB Z, Z) |
34957 | // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? |
34958 | SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); |
34959 | if (SBB.getValueSizeInBits() != VT.getSizeInBits()) |
34960 | SBB = DAG.getZExtOrTrunc(SBB, DL, VT); |
34961 | return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); |
34962 | } |
34963 | |
34964 | if (CC == X86::COND_A) { |
34965 | SDValue EFLAGS = Y->getOperand(1); |
34966 | // Try to convert COND_A into COND_B in an attempt to facilitate |
34967 | // materializing "setb reg". |
34968 | // |
34969 | // Do not flip "e > c", where "c" is a constant, because Cmp instruction |
34970 | // cannot take an immediate as its first operand. |
34971 | // |
34972 | if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && |
34973 | EFLAGS.getValueType().isInteger() && |
34974 | !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { |
34975 | SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), |
34976 | EFLAGS.getNode()->getVTList(), |
34977 | EFLAGS.getOperand(1), EFLAGS.getOperand(0)); |
34978 | SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); |
34979 | SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); |
34980 | if (SBB.getValueSizeInBits() != VT.getSizeInBits()) |
34981 | SBB = DAG.getZExtOrTrunc(SBB, DL, VT); |
34982 | return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); |
34983 | } |
34984 | } |
34985 | |
34986 | if (CC != X86::COND_E && CC != X86::COND_NE) |
34987 | return SDValue(); |
34988 | |
34989 | SDValue Cmp = Y.getOperand(1); |
34990 | if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || |
34991 | !X86::isZeroNode(Cmp.getOperand(1)) || |
34992 | !Cmp.getOperand(0).getValueType().isInteger()) |
34993 | return SDValue(); |
34994 | |
34995 | SDValue Z = Cmp.getOperand(0); |
34996 | EVT ZVT = Z.getValueType(); |
34997 | |
34998 | // If X is -1 or 0, then we have an opportunity to avoid constants required in |
34999 | // the general case below. |
35000 | if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) { |
35001 | // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with |
35002 | // fake operands: |
35003 | // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) |
35004 | // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) |
35005 | if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || |
35006 | (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { |
35007 | SDValue Zero = DAG.getConstant(0, DL, ZVT); |
35008 | SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); |
35009 | SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); |
35010 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, |
35011 | DAG.getConstant(X86::COND_B, DL, MVT::i8), |
35012 | SDValue(Neg.getNode(), 1)); |
35013 | } |
35014 | |
35015 | // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' |
35016 | // with fake operands: |
35017 | // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) |
35018 | // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) |
35019 | if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || |
35020 | (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { |
35021 | SDValue One = DAG.getConstant(1, DL, ZVT); |
35022 | SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); |
35023 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, |
35024 | DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); |
35025 | } |
35026 | } |
35027 | |
35028 | // (cmp Z, 1) sets the carry flag if Z is 0. |
35029 | SDValue One = DAG.getConstant(1, DL, ZVT); |
35030 | SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); |
35031 | |
35032 | // Add the flags type for ADC/SBB nodes. |
35033 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); |
35034 | |
35035 | // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) |
35036 | // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) |
35037 | if (CC == X86::COND_NE) |
35038 | return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, |
35039 | DAG.getConstant(-1ULL, DL, VT), Cmp1); |
35040 | |
35041 | // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) |
35042 | // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) |
35043 | return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, |
35044 | DAG.getConstant(0, DL, VT), Cmp1); |
35045 | } |
35046 | |
35047 | static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, |
35048 | const X86Subtarget &Subtarget) { |
35049 | SDValue MulOp = N->getOperand(0); |
35050 | SDValue Phi = N->getOperand(1); |
35051 | |
35052 | if (MulOp.getOpcode() != ISD::MUL) |
35053 | std::swap(MulOp, Phi); |
35054 | if (MulOp.getOpcode() != ISD::MUL) |
35055 | return SDValue(); |
35056 | |
35057 | ShrinkMode Mode; |
35058 | if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16) |
35059 | return SDValue(); |
35060 | |
35061 | EVT VT = N->getValueType(0); |
35062 | |
35063 | unsigned RegSize = 128; |
35064 | if (Subtarget.hasBWI()) |
35065 | RegSize = 512; |
35066 | else if (Subtarget.hasAVX2()) |
35067 | RegSize = 256; |
35068 | unsigned VectorSize = VT.getVectorNumElements() * 16; |
35069 | // If the vector size is less than 128, or greater than the supported RegSize, |
35070 | // do not use PMADD. |
35071 | if (VectorSize < 128 || VectorSize > RegSize) |
35072 | return SDValue(); |
35073 | |
35074 | SDLoc DL(N); |
35075 | EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, |
35076 | VT.getVectorNumElements()); |
35077 | EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, |
35078 | VT.getVectorNumElements() / 2); |
35079 | |
35080 | // Shrink the operands of mul. |
35081 | SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); |
35082 | SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); |
35083 | |
35084 | // Madd vector size is half of the original vector size |
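      | // (e.g. vpmaddwd on two v8i16 operands yields v4i32 pairwise
      | // multiply-and-accumulate results).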
35085 | SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); |
35086 | // Fill the rest of the output with 0 |
35087 | SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); |
35088 | SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); |
35089 | return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); |
35090 | } |
35091 | |
35092 | static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, |
35093 | const X86Subtarget &Subtarget) { |
35094 | SDLoc DL(N); |
35095 | EVT VT = N->getValueType(0); |
35096 | SDValue Op0 = N->getOperand(0); |
35097 | SDValue Op1 = N->getOperand(1); |
35098 | |
35099 | // TODO: There's nothing special about i32, any integer type above i16 should |
35100 | // work just as well. |
35101 | if (!VT.isVector() || !VT.isSimple() || |
35102 | VT.getVectorElementType() != MVT::i32)
35103 | return SDValue(); |
35104 | |
35105 | unsigned RegSize = 128; |
35106 | if (Subtarget.hasBWI()) |
35107 | RegSize = 512; |
35108 | else if (Subtarget.hasAVX2()) |
35109 | RegSize = 256; |
35110 | |
35111 | // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. |
35112 | // TODO: We should be able to handle larger vectors by splitting them before |
35113 | // feeding them into several SADs, and then reducing over those. |
35114 | if (VT.getSizeInBits() / 4 > RegSize) |
35115 | return SDValue(); |
35116 | |
35117 | // We know N is a reduction add, which means one of its operands is a phi. |
35118 | // To match SAD, we need the other operand to be a vector select. |
35119 | SDValue SelectOp, Phi; |
35120 | if (Op0.getOpcode() == ISD::VSELECT) { |
35121 | SelectOp = Op0; |
35122 | Phi = Op1; |
35123 | } else if (Op1.getOpcode() == ISD::VSELECT) { |
35124 | SelectOp = Op1; |
35125 | Phi = Op0; |
35126 | } else |
35127 | return SDValue(); |
35128 | |
35129 | // Check whether we have an abs-diff pattern feeding into the select. |
35130 | if (!detectZextAbsDiff(SelectOp, Op0, Op1))
35131 | return SDValue(); |
35132 | |
35133 | // SAD pattern detected. Now build a SAD instruction and an addition for |
35134 | // reduction. Note that the number of elements of the result of SAD is less |
35135 | than the number of elements of its input. Therefore, we can only update
35136 | part of the elements of the reduction vector.
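      | // Illustrative: psadbw on two v16i8 inputs yields v2i64 (one partial
      | // sum per 8-byte half), so only the low lanes of a wider accumulator
      | // such as v16i32 receive new data.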
35137 | SDValue Sad = createPSADBW(DAG, Op0, Op1, DL); |
35138 | |
35139 | // The output of PSADBW is a vector of i64. |
35140 | // We need to turn the vector of i64 into a vector of i32. |
35141 | // If the reduction vector is at least as wide as the psadbw result, just |
35142 | // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero |
35143 | // anyway. |
35144 | MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); |
35145 | if (VT.getSizeInBits() >= ResVT.getSizeInBits()) |
35146 | Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); |
35147 | else |
35148 | Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); |
35149 | |
35150 | if (VT.getSizeInBits() > ResVT.getSizeInBits()) { |
35151 | // Update part of the elements of the reduction vector. This is done by first
35152 | // extracting a sub-vector from it, updating this sub-vector, and inserting |
35153 | // it back. |
35154 | SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, |
35155 | DAG.getIntPtrConstant(0, DL)); |
35156 | SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); |
35157 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, |
35158 | DAG.getIntPtrConstant(0, DL)); |
35159 | } else |
35160 | return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); |
35161 | } |
35162 | |
35163 | /// Convert vector increment or decrement to sub/add with an all-ones constant: |
35164 | /// add X, <1, 1...> --> sub X, <-1, -1...> |
35165 | /// sub X, <1, 1...> --> add X, <-1, -1...> |
35166 | /// The all-ones vector constant can be materialized using a pcmpeq instruction |
35167 | /// that is commonly recognized as an idiom (has no register dependency), so |
35168 | /// that's better/smaller than loading a splat 1 constant. |
35169 | static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { |
35170 | assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35171 |        "Unexpected opcode for increment/decrement transform");
35172 | |
35173 | // Pseudo-legality check: getOnesVector() expects one of these types, so bail |
35174 | // out and wait for legalization if we have an unsupported vector length. |
35175 | EVT VT = N->getValueType(0); |
35176 | if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) |
35177 | return SDValue(); |
35178 | |
35179 | SDNode *N1 = N->getOperand(1).getNode(); |
35180 | APInt SplatVal; |
35181 | if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue()) |
35182 | return SDValue(); |
35183 | |
35184 | SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); |
35185 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
35186 | return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); |
35187 | } |
35188 | |
35189 | static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, |
35190 | const X86Subtarget &Subtarget) { |
35191 | const SDNodeFlags Flags = N->getFlags(); |
35192 | if (Flags.hasVectorReduction()) { |
35193 | if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) |
35194 | return Sad; |
35195 | if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) |
35196 | return MAdd; |
35197 | } |
35198 | EVT VT = N->getValueType(0); |
35199 | SDValue Op0 = N->getOperand(0); |
35200 | SDValue Op1 = N->getOperand(1); |
35201 | |
35202 | // Try to synthesize horizontal adds from adds of shuffles. |
35203 | if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || |
35204 | (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && |
35205 | isHorizontalBinOp(Op0, Op1, true)) |
35206 | return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); |
35207 | |
35208 | if (SDValue V = combineIncDecVector(N, DAG)) |
35209 | return V; |
35210 | |
35211 | return combineAddOrSubToADCOrSBB(N, DAG); |
35212 | } |
35213 | |
35214 | static SDValue combineSub(SDNode *N, SelectionDAG &DAG, |
35215 | const X86Subtarget &Subtarget) { |
35216 | SDValue Op0 = N->getOperand(0); |
35217 | SDValue Op1 = N->getOperand(1); |
35218 | |
35219 | // X86 can't encode an immediate LHS of a sub. See if we can push the |
35220 | // negation into a preceding instruction. |
35221 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { |
35222 | // If the RHS of the sub is a XOR with one use and a constant, invert the |
35223 | // immediate. Then add one to the LHS of the sub so we can turn |
35224 | // X-Y -> X+~Y+1, saving one register. |
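      | // Illustrative: 5 - (X ^ 3) --> (X ^ ~3) + 6, using the identity
      | // (X ^ ~C) == -(X ^ C) - 1.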
35225 | if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && |
35226 | isa<ConstantSDNode>(Op1.getOperand(1))) { |
35227 | APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); |
35228 | EVT VT = Op0.getValueType(); |
35229 | SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, |
35230 | Op1.getOperand(0), |
35231 | DAG.getConstant(~XorC, SDLoc(Op1), VT)); |
35232 | return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, |
35233 | DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); |
35234 | } |
35235 | } |
35236 | |
35237 | // Try to synthesize horizontal subs from subs of shuffles. |
35238 | EVT VT = N->getValueType(0); |
35239 | if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || |
35240 | (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && |
35241 | isHorizontalBinOp(Op0, Op1, false)) |
35242 | return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); |
35243 | |
35244 | if (SDValue V = combineIncDecVector(N, DAG)) |
35245 | return V; |
35246 | |
35247 | return combineAddOrSubToADCOrSBB(N, DAG); |
35248 | } |
35249 | |
35250 | static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, |
35251 | TargetLowering::DAGCombinerInfo &DCI, |
35252 | const X86Subtarget &Subtarget) { |
35253 | if (DCI.isBeforeLegalize()) |
35254 | return SDValue(); |
35255 | |
35256 | SDLoc DL(N); |
35257 | unsigned Opcode = N->getOpcode(); |
35258 | MVT VT = N->getSimpleValueType(0); |
35259 | MVT SVT = VT.getVectorElementType(); |
35260 | unsigned NumElts = VT.getVectorNumElements(); |
35261 | unsigned EltSizeInBits = SVT.getSizeInBits(); |
35262 | |
35263 | SDValue Op = N->getOperand(0); |
35264 | MVT OpVT = Op.getSimpleValueType(); |
35265 | MVT OpEltVT = OpVT.getVectorElementType(); |
35266 | unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); |
35267 | unsigned InputBits = OpEltSizeInBits * NumElts; |
35268 | |
35269 | // Perform any constant folding. |
35270 | // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. |
35271 | APInt UndefElts; |
35272 | SmallVector<APInt, 64> EltBits; |
35273 | if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { |
35274 | APInt Undefs(NumElts, 0); |
35275 | SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0)); |
35276 | bool IsZEXT = |
35277 | (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); |
35278 | for (unsigned i = 0; i != NumElts; ++i) { |
35279 | if (UndefElts[i]) { |
35280 | Undefs.setBit(i); |
35281 | continue; |
35282 | } |
35283 | Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits) |
35284 | : EltBits[i].sextOrTrunc(EltSizeInBits); |
35285 | } |
35286 | return getConstVector(Vals, Undefs, VT, DAG, DL); |
35287 | } |
35288 | |
35289 | // (vzext (bitcast (vzext x))) -> (vzext x)
35290 | // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
35291 | SDValue V = peekThroughBitcasts(Op); |
35292 | if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) { |
35293 | MVT InnerVT = V.getSimpleValueType(); |
35294 | MVT InnerEltVT = InnerVT.getVectorElementType(); |
35295 | |
35296 | // If the element sizes match exactly, we can just do one larger vzext. This |
35297 | // is always an exact type match as vzext operates on integer types. |
35298 | if (OpEltVT == InnerEltVT) { |
35299 | assert(OpVT == InnerVT && "Types must match for vzext!");
35300 | return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); |
35301 | } |
35302 | |
35303 | // The only other way we can combine them is if only a single element of the |
35304 | // inner vzext is used in the input to the outer vzext. |
35305 | if (InnerEltVT.getSizeInBits() < InputBits) |
35306 | return SDValue(); |
35307 | |
35308 | // In this case, the inner vzext is completely dead because we're going to |
35309 | // only look at bits inside of the low element. Just do the outer vzext on |
35310 | // a bitcast of the input to the inner. |
35311 | return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); |
35312 | } |
35313 | |
35314 | // Check if we can bypass extracting and re-inserting an element of an input |
35315 | // vector. Essentially: |
35316 | // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) |
35317 | // TODO: Add X86ISD::VSEXT support |
35318 | if (Opcode == X86ISD::VZEXT && |
35319 | V.getOpcode() == ISD::SCALAR_TO_VECTOR && |
35320 | V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
35321 | V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { |
35322 | SDValue ExtractedV = V.getOperand(0); |
35323 | SDValue OrigV = ExtractedV.getOperand(0); |
35324 | if (isNullConstant(ExtractedV.getOperand(1))) { |
35325 | MVT OrigVT = OrigV.getSimpleValueType(); |
35326 | // Extract a subvector if necessary... |
35327 | if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { |
35328 | int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); |
35329 | OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), |
35330 | OrigVT.getVectorNumElements() / Ratio); |
35331 | OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, |
35332 | DAG.getIntPtrConstant(0, DL)); |
35333 | } |
35334 | Op = DAG.getBitcast(OpVT, OrigV); |
35335 | return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); |
35336 | } |
35337 | } |
35338 | |
35339 | return SDValue(); |
35340 | } |
35341 | |
35342 | /// Canonicalize (LSUB p, 1) -> (LADD p, -1). |
35343 | static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG, |
35344 | const X86Subtarget &Subtarget) { |
35345 | SDValue Chain = N->getOperand(0); |
35346 | SDValue LHS = N->getOperand(1); |
35347 | SDValue RHS = N->getOperand(2); |
35348 | MVT VT = RHS.getSimpleValueType(); |
35349 | SDLoc DL(N); |
35350 | |
35351 | auto *C = dyn_cast<ConstantSDNode>(RHS); |
35352 | if (!C || C->getZExtValue() != 1) |
35353 | return SDValue(); |
35354 | |
35355 | RHS = DAG.getConstant(-1, DL, VT); |
35356 | MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); |
35357 | return DAG.getMemIntrinsicNode(X86ISD::LADD, DL, |
35358 | DAG.getVTList(MVT::i32, MVT::Other), |
35359 | {Chain, LHS, RHS}, VT, MMO); |
35360 | } |
35361 | |
35362 | // TEST (AND a, b) ,(AND a, b) -> TEST a, b |
35363 | static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) { |
35364 | SDValue Op0 = N->getOperand(0); |
35365 | SDValue Op1 = N->getOperand(1); |
35366 | |
35367 | if (Op0 != Op1 || Op1->getOpcode() != ISD::AND) |
35368 | return SDValue(); |
35369 | |
35370 | EVT VT = N->getValueType(0); |
35371 | SDLoc DL(N); |
35372 | |
35373 | return DAG.getNode(X86ISD::TESTM, DL, VT, |
35374 | Op0->getOperand(0), Op0->getOperand(1)); |
35375 | } |
35376 | |
35377 | static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, |
35378 | const X86Subtarget &Subtarget) { |
35379 | MVT VT = N->getSimpleValueType(0); |
35380 | SDLoc DL(N); |
35381 | |
35382 | if (N->getOperand(0) == N->getOperand(1)) { |
35383 | if (N->getOpcode() == X86ISD::PCMPEQ) |
35384 | return getOnesVector(VT, DAG, DL); |
35385 | if (N->getOpcode() == X86ISD::PCMPGT) |
35386 | return getZeroVector(VT, Subtarget, DAG, DL); |
35387 | } |
35388 | |
35389 | return SDValue(); |
35390 | } |
35391 | |
35392 | static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, |
35393 | TargetLowering::DAGCombinerInfo &DCI, |
35394 | const X86Subtarget &Subtarget) { |
35395 | if (DCI.isBeforeLegalizeOps()) |
35396 | return SDValue(); |
35397 | |
35398 | SDLoc dl(N); |
35399 | SDValue Vec = N->getOperand(0); |
35400 | SDValue SubVec = N->getOperand(1); |
35401 | SDValue Idx = N->getOperand(2); |
35402 | |
35403 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); |
35404 | MVT OpVT = N->getSimpleValueType(0); |
35405 | MVT SubVecVT = SubVec.getSimpleValueType(); |
35406 | |
35407 | // If this is an insert of an extract, combine to a shuffle. Don't do this |
35408 | // if the insert or extract can be represented with a subvector operation. |
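      | // Illustrative: inserting lanes [2,3] extracted from v4i64 X at index 0
      | // of v4i64 Y yields shuffle(Y, X, <6, 7, 2, 3>).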
35409 | if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
35410 | SubVec.getOperand(0).getSimpleValueType() == OpVT && |
35411 | (IdxVal != 0 || !Vec.isUndef())) { |
35412 | int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue(); |
35413 | if (ExtIdxVal != 0) { |
35414 | int VecNumElts = OpVT.getVectorNumElements(); |
35415 | int SubVecNumElts = SubVecVT.getVectorNumElements(); |
35416 | SmallVector<int, 64> Mask(VecNumElts); |
35417 | // First create an identity shuffle mask. |
35418 | for (int i = 0; i != VecNumElts; ++i) |
35419 | Mask[i] = i; |
35420 | // Now insert the extracted portion. |
35421 | for (int i = 0; i != SubVecNumElts; ++i) |
35422 | Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; |
35423 | |
35424 | return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); |
35425 | } |
35426 | } |
35427 | |
35428 | // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte |
35429 | // load: |
35430 | // (insert_subvector (insert_subvector undef, (load16 addr), 0), |
35431 | // (load16 addr + 16), Elts/2) |
35432 | // --> load32 addr |
35433 | // or: |
35434 | // (insert_subvector (insert_subvector undef, (load32 addr), 0), |
35435 | // (load32 addr + 32), Elts/2) |
35436 | // --> load64 addr |
35437 | // or a 16-byte or 32-byte broadcast: |
35438 | // (insert_subvector (insert_subvector undef, (load16 addr), 0), |
35439 | // (load16 addr), Elts/2) |
35440 | // --> X86SubVBroadcast(load16 addr) |
35441 | // or: |
35442 | // (insert_subvector (insert_subvector undef, (load32 addr), 0), |
35443 | // (load32 addr), Elts/2) |
35444 | // --> X86SubVBroadcast(load32 addr) |
35445 | if ((IdxVal == OpVT.getVectorNumElements() / 2) && |
35446 | Vec.getOpcode() == ISD::INSERT_SUBVECTOR && |
35447 | OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { |
35448 | auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); |
35449 | if (Idx2 && Idx2->getZExtValue() == 0) { |
35450 | SDValue SubVec2 = Vec.getOperand(1); |
35451 | // If needed, look through bitcasts to get to the load. |
35452 | if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { |
35453 | bool Fast; |
35454 | unsigned Alignment = FirstLd->getAlignment(); |
35455 | unsigned AS = FirstLd->getAddressSpace(); |
35456 | const X86TargetLowering *TLI = Subtarget.getTargetLowering(); |
35457 | if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), |
35458 | OpVT, AS, Alignment, &Fast) && Fast) { |
35459 | SDValue Ops[] = {SubVec2, SubVec}; |
35460 | if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, |
35461 | Subtarget, false)) |
35462 | return Ld; |
35463 | } |
35464 | } |
35465 | // If lower/upper loads are the same and the only users of the load, then |
35466 | // lower to a VBROADCASTF128/VBROADCASTI128/etc. |
35467 | if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { |
35468 | if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && |
35469 | SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) { |
35470 | return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); |
35471 | } |
35472 | } |
35473 | // If this is subv_broadcast insert into both halves, use a larger |
35474 | // subv_broadcast. |
35475 | if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { |
35476 | return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, |
35477 | SubVec.getOperand(0)); |
35478 | } |
35479 | } |
35480 | } |
35481 | |
35482 | return SDValue(); |
35483 | } |
35484 | |
35485 | |
35486 | SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, |
35487 | DAGCombinerInfo &DCI) const { |
35488 | SelectionDAG &DAG = DCI.DAG; |
35489 | switch (N->getOpcode()) { |
35490 | default: break; |
35491 | case ISD::EXTRACT_VECTOR_ELT: |
35492 | return combineExtractVectorElt(N, DAG, DCI, Subtarget); |
35493 | case X86ISD::PEXTRW: |
35494 | case X86ISD::PEXTRB: |
35495 | return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget); |
35496 | case ISD::INSERT_SUBVECTOR: |
35497 | return combineInsertSubvector(N, DAG, DCI, Subtarget); |
35498 | case ISD::VSELECT: |
35499 | case ISD::SELECT: |
35500 | case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); |
35501 | case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); |
35502 | case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); |
35503 | case ISD::ADD: return combineAdd(N, DAG, Subtarget); |
35504 | case ISD::SUB: return combineSub(N, DAG, Subtarget); |
35505 | case X86ISD::ADD: return combineX86ADD(N, DAG, DCI); |
35506 | case X86ISD::ADC: return combineADC(N, DAG, DCI); |
35507 | case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); |
35508 | case ISD::SHL: |
35509 | case ISD::SRA: |
35510 | case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); |
35511 | case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); |
35512 | case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); |
35513 | case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); |
35514 | case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); |
35515 | case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); |
35516 | case ISD::STORE: return combineStore(N, DAG, Subtarget); |
35517 | case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget); |
35518 | case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); |
35519 | case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); |
35520 | case ISD::FADD: |
35521 | case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); |
35522 | case ISD::FNEG: return combineFneg(N, DAG, Subtarget); |
35523 | case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); |
35524 | case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); |
35525 | case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); |
35526 | case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); |
35527 | case X86ISD::FXOR: |
35528 | case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); |
35529 | case X86ISD::FMIN: |
35530 | case X86ISD::FMAX: return combineFMinFMax(N, DAG); |
35531 | case ISD::FMINNUM: |
35532 | case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); |
35533 | case X86ISD::BT: return combineBT(N, DAG, DCI); |
35534 | case ISD::ANY_EXTEND: |
35535 | case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); |
35536 | case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); |
35537 | case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); |
35538 | case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); |
35539 | case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); |
35540 | case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); |
35541 | case X86ISD::VSHLI: |
35542 | case X86ISD::VSRAI: |
35543 | case X86ISD::VSRLI: |
35544 | return combineVectorShiftImm(N, DAG, DCI, Subtarget); |
35545 | case ISD::SIGN_EXTEND_VECTOR_INREG: |
35546 | case ISD::ZERO_EXTEND_VECTOR_INREG: |
35547 | case X86ISD::VSEXT: |
35548 | case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); |
35549 | case X86ISD::PINSRB: |
35550 | case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); |
35551 | case X86ISD::SHUFP: // Handle all target specific shuffles |
35552 | case X86ISD::INSERTPS: |
35553 | case X86ISD::PALIGNR: |
35554 | case X86ISD::VSHLDQ: |
35555 | case X86ISD::VSRLDQ: |
35556 | case X86ISD::BLENDI: |
35557 | case X86ISD::UNPCKH: |
35558 | case X86ISD::UNPCKL: |
35559 | case X86ISD::MOVHLPS: |
35560 | case X86ISD::MOVLHPS: |
35561 | case X86ISD::PSHUFB: |
35562 | case X86ISD::PSHUFD: |
35563 | case X86ISD::PSHUFHW: |
35564 | case X86ISD::PSHUFLW: |
35565 | case X86ISD::MOVSHDUP: |
35566 | case X86ISD::MOVSLDUP: |
35567 | case X86ISD::MOVDDUP: |
35568 | case X86ISD::MOVSS: |
35569 | case X86ISD::MOVSD: |
35570 | case X86ISD::VPPERM: |
35571 | case X86ISD::VPERMI: |
35572 | case X86ISD::VPERMV: |
35573 | case X86ISD::VPERMV3: |
35574 | case X86ISD::VPERMIV3: |
35575 | case X86ISD::VPERMIL2: |
35576 | case X86ISD::VPERMILPI: |
35577 | case X86ISD::VPERMILPV: |
35578 | case X86ISD::VPERM2X128: |
35579 | case X86ISD::VZEXT_MOVL: |
35580 | case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
35581 | case X86ISD::FMADD: |
35582 | case X86ISD::FMADD_RND: |
35583 | case X86ISD::FMADDS1_RND: |
35584 | case X86ISD::FMADDS3_RND: |
35585 | case ISD::FMA: return combineFMA(N, DAG, Subtarget); |
35586 | case ISD::MGATHER: |
35587 | case ISD::MSCATTER: return combineGatherScatter(N, DAG); |
35588 | case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget); |
35589 | case X86ISD::TESTM: return combineTestM(N, DAG); |
35590 | case X86ISD::PCMPEQ: |
35591 | case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); |
35592 | } |
35593 | |
35594 | return SDValue(); |
35595 | } |
35596 | |
35597 | /// Return true if the target has native support for the specified value type |
35598 | /// and it is 'desirable' to use the type for the given node type. e.g. On x86 |
35599 | /// i16 is legal, but undesirable since i16 instruction encodings are longer and |
35600 | /// some i16 instructions are slow. |
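      | /// (16-bit operations carry the 0x66 operand-size prefix, and writes to
      | /// 16-bit subregisters can cause partial-register stalls on some
      | /// microarchitectures.)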
35601 | bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { |
35602 | if (!isTypeLegal(VT)) |
35603 | return false; |
35604 | if (VT != MVT::i16) |
35605 | return true; |
35606 | |
35607 | switch (Opc) { |
35608 | default: |
35609 | return true; |
35610 | case ISD::LOAD: |
35611 | case ISD::SIGN_EXTEND: |
35612 | case ISD::ZERO_EXTEND: |
35613 | case ISD::ANY_EXTEND: |
35614 | case ISD::SHL: |
35615 | case ISD::SRL: |
35616 | case ISD::SUB: |
35617 | case ISD::ADD: |
35618 | case ISD::MUL: |
35619 | case ISD::AND: |
35620 | case ISD::OR: |
35621 | case ISD::XOR: |
35622 | return false; |
35623 | } |
35624 | } |
35625 | |
35626 | /// This function checks if any of the users of EFLAGS copies the EFLAGS. We |
35627 | /// know that the code that lowers COPY of EFLAGS has to use the stack, and if |
35628 | /// we don't adjust the stack we clobber the first frame index. |
35629 | /// See X86InstrInfo::copyPhysReg. |
35630 | static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) { |
35631 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
35632 | return any_of(MRI.reg_instructions(X86::EFLAGS), |
35633 | [](const MachineInstr &RI) { return RI.isCopy(); }); |
35634 | } |
35635 | |
35636 | void X86TargetLowering::finalizeLowering(MachineFunction &MF) const { |
35637 | if (hasCopyImplyingStackAdjustment(MF)) { |
35638 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
35639 | MFI.setHasCopyImplyingStackAdjustment(true); |
35640 | } |
35641 | |
35642 | TargetLoweringBase::finalizeLowering(MF); |
35643 | } |
35644 | |
35645 | /// This method queries the target whether it is beneficial for the DAG combiner to
35646 | /// promote the specified node. If true, it should return the desired promotion |
35647 | /// type by reference. |
35648 | bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { |
35649 | EVT VT = Op.getValueType(); |
35650 | if (VT != MVT::i16) |
35651 | return false; |
35652 | |
35653 | bool Promote = false; |
35654 | bool Commute = false; |
35655 | switch (Op.getOpcode()) { |
35656 | default: break; |
35657 | case ISD::SIGN_EXTEND: |
35658 | case ISD::ZERO_EXTEND: |
35659 | case ISD::ANY_EXTEND: |
35660 | Promote = true; |
35661 | break; |
35662 | case ISD::SHL: |
35663 | case ISD::SRL: { |
35664 | SDValue N0 = Op.getOperand(0); |
35665 | // Look out for (store (shl (load), x)). |
35666 | if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) |
35667 | return false; |
35668 | Promote = true; |
35669 | break; |
35670 | } |
35671 | case ISD::ADD: |
35672 | case ISD::MUL: |
35673 | case ISD::AND: |
35674 | case ISD::OR: |
35675 | case ISD::XOR: |
35676 | Commute = true; |
35677 | LLVM_FALLTHROUGH;
35678 | case ISD::SUB: { |
35679 | SDValue N0 = Op.getOperand(0); |
35680 | SDValue N1 = Op.getOperand(1); |
35681 | if (!Commute && MayFoldLoad(N1)) |
35682 | return false; |
35683 | // Avoid disabling potential load folding opportunities. |
35684 | if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) |
35685 | return false; |
35686 | if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) |
35687 | return false; |
35688 | Promote = true; |
35689 | } |
35690 | } |
35691 | |
35692 | PVT = MVT::i32; |
35693 | return Promote; |
35694 | } |
35695 | |
35696 | //===----------------------------------------------------------------------===// |
35697 | // X86 Inline Assembly Support |
35698 | //===----------------------------------------------------------------------===// |
35699 | |
35700 | // Helper to match a string against pieces separated by whitespace.
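      | // e.g. matchAsm("bswap  $0", {"bswap", "$0"}) returns true, while
      | // matchAsm("bswapper $0", {"bswap", "$0"}) does not, since pieces must
      | // match whole tokens.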
35701 | static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { |
35702 | S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. |
35703 | |
35704 | for (StringRef Piece : Pieces) { |
35705 | if (!S.startswith(Piece)) // Check if the piece matches. |
35706 | return false; |
35707 | |
35708 | S = S.substr(Piece.size()); |
35709 | StringRef::size_type Pos = S.find_first_not_of(" \t"); |
35710 | if (Pos == 0) // We matched a prefix. |
35711 | return false; |
35712 | |
35713 | S = S.substr(Pos); |
35714 | } |
35715 | |
35716 | return S.empty(); |
35717 | } |
35718 | |
35719 | static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { |
35720 | |
35721 | if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { |
35722 | if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && |
35723 | std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && |
35724 | std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { |
35725 | |
35726 | if (AsmPieces.size() == 3) |
35727 | return true; |
35728 | else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) |
35729 | return true; |
35730 | } |
35731 | } |
35732 | return false; |
35733 | } |
35734 | |
35735 | bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { |
35736 | InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); |
35737 | |
35738 | const std::string &AsmStr = IA->getAsmString(); |
35739 | |
35740 | IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); |
35741 | if (!Ty || Ty->getBitWidth() % 16 != 0) |
35742 | return false; |
35743 | |
35744 | // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" |
35745 | SmallVector<StringRef, 4> AsmPieces; |
35746 | SplitString(AsmStr, AsmPieces, ";\n"); |
35747 | |
35748 | switch (AsmPieces.size()) { |
35749 | default: return false; |
35750 | case 1: |
35751 | // FIXME: this should verify that we are targeting a 486 or better. If not, |
35752 | // we will turn this bswap into something that will be lowered to logical |
35753 | // ops instead of emitting the bswap asm. For now, we don't support 486 or |
35754 | // lower so don't worry about this. |
35755 | // bswap $0 |
35756 | if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || |
35757 | matchAsm(AsmPieces[0], {"bswapl", "$0"}) || |
35758 | matchAsm(AsmPieces[0], {"bswapq", "$0"}) || |
35759 | matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || |
35760 | matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || |
35761 | matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { |
35762 | // No need to check constraints, nothing other than the equivalent of |
35763 | // "=r,0" would be valid here. |
35764 | return IntrinsicLowering::LowerToByteSwap(CI); |
35765 | } |
35766 | |
35767 | // rorw $$8, ${0:w} --> llvm.bswap.i16 |
35768 | if (CI->getType()->isIntegerTy(16) && |
35769 | IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && |
35770 | (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || |
35771 | matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { |
35772 | AsmPieces.clear(); |
35773 | StringRef ConstraintsStr = IA->getConstraintString(); |
35774 | SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); |
35775 | array_pod_sort(AsmPieces.begin(), AsmPieces.end()); |
35776 | if (clobbersFlagRegisters(AsmPieces)) |
35777 | return IntrinsicLowering::LowerToByteSwap(CI); |
35778 | } |
35779 | break; |
35780 | case 3: |
35781 | if (CI->getType()->isIntegerTy(32) && |
35782 | IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && |
35783 | matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && |
35784 | matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && |
35785 | matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { |
35786 | AsmPieces.clear(); |
35787 | StringRef ConstraintsStr = IA->getConstraintString(); |
35788 | SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); |
35789 | array_pod_sort(AsmPieces.begin(), AsmPieces.end()); |
35790 | if (clobbersFlagRegisters(AsmPieces)) |
35791 | return IntrinsicLowering::LowerToByteSwap(CI); |
35792 | } |
35793 | |
35794 | if (CI->getType()->isIntegerTy(64)) { |
35795 | InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); |
35796 | if (Constraints.size() >= 2 && |
35797 | Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && |
35798 | Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { |
35799 | // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 |
35800 | if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && |
35801 | matchAsm(AsmPieces[1], {"bswap", "%edx"}) && |
35802 | matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) |
35803 | return IntrinsicLowering::LowerToByteSwap(CI); |
35804 | } |
35805 | } |
35806 | break; |
35807 | } |
35808 | return false; |
35809 | } |
35810 | |
35811 | /// Given a constraint letter, return the type of constraint for this target. |
35812 | X86TargetLowering::ConstraintType |
35813 | X86TargetLowering::getConstraintType(StringRef Constraint) const { |
35814 | if (Constraint.size() == 1) { |
35815 | switch (Constraint[0]) { |
35816 | case 'R': |
35817 | case 'q': |
35818 | case 'Q': |
35819 | case 'f': |
35820 | case 't': |
35821 | case 'u': |
35822 | case 'y': |
35823 | case 'x': |
35824 | case 'v': |
35825 | case 'Y': |
35826 | case 'l': |
35827 | return C_RegisterClass; |
35828 | case 'k': // AVX512 masking registers. |
35829 | case 'a': |
35830 | case 'b': |
35831 | case 'c': |
35832 | case 'd': |
35833 | case 'S': |
35834 | case 'D': |
35835 | case 'A': |
35836 | return C_Register; |
35837 | case 'I': |
35838 | case 'J': |
35839 | case 'K': |
35840 | case 'L': |
35841 | case 'M': |
35842 | case 'N': |
35843 | case 'G': |
35844 | case 'C': |
35845 | case 'e': |
35846 | case 'Z': |
35847 | return C_Other; |
35848 | default: |
35849 | break; |
35850 | } |
35851 | } |
35852 | else if (Constraint.size() == 2) { |
35853 | switch (Constraint[0]) { |
35854 | default: |
35855 | break; |
35856 | case 'Y': |
35857 | switch (Constraint[1]) { |
35858 | default: |
35859 | break; |
35860 | case 'k': |
35861 | return C_Register; |
35862 | } |
35863 | } |
35864 | } |
35865 | return TargetLowering::getConstraintType(Constraint); |
35866 | } |
35867 | |
35868 | /// Examine constraint type and operand type and determine a weight value. |
35869 | /// This object must already have been set up with the operand type |
35870 | /// and the current alternative constraint selected. |
35871 | TargetLowering::ConstraintWeight |
35872 | X86TargetLowering::getSingleConstraintMatchWeight( |
35873 | AsmOperandInfo &info, const char *constraint) const { |
35874 | ConstraintWeight weight = CW_Invalid; |
35875 | Value *CallOperandVal = info.CallOperandVal; |
35876 | // If we don't have a value, we can't do a match, |
35877 | // but allow it at the lowest weight. |
35878 | if (!CallOperandVal) |
35879 | return CW_Default; |
35880 | Type *type = CallOperandVal->getType(); |
35881 | // Look at the constraint type. |
35882 | switch (*constraint) { |
35883 | default: |
35884 | weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); |
35885 | LLVM_FALLTHROUGH;
35886 | case 'R': |
35887 | case 'q': |
35888 | case 'Q': |
35889 | case 'a': |
35890 | case 'b': |
35891 | case 'c': |
35892 | case 'd': |
35893 | case 'S': |
35894 | case 'D': |
35895 | case 'A': |
35896 | if (CallOperandVal->getType()->isIntegerTy()) |
35897 | weight = CW_SpecificReg; |
35898 | break; |
35899 | case 'f': |
35900 | case 't': |
35901 | case 'u': |
35902 | if (type->isFloatingPointTy()) |
35903 | weight = CW_SpecificReg; |
35904 | break; |
35905 | case 'y': |
35906 | if (type->isX86_MMXTy() && Subtarget.hasMMX()) |
35907 | weight = CW_SpecificReg; |
35908 | break; |
35909 | case 'Y': |
35910 | // Other "Y<x>" (e.g. "Yk") constraints should be implemented below. |
35911 | if (constraint[1] == 'k') { |
35912 | // Support for 'Yk' (similarly to the 'k' variant below). |
35913 | weight = CW_SpecificReg; |
35914 | break; |
35915 | } |
35916 | // Else fall through (handle "Y" constraint). |
35917 | LLVM_FALLTHROUGH;
35918 | case 'v': |
35919 | if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) |
35920 | weight = CW_Register; |
35921 | LLVM_FALLTHROUGH;
35922 | case 'x': |
35923 | if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || |
35924 | ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) |
35925 | weight = CW_Register; |
35926 | break; |
35927 | case 'k': |
35928 | // Enable conditional vector operations using %k<#> registers. |
35929 | weight = CW_SpecificReg; |
35930 | break; |
35931 | case 'I': |
35932 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35933 | if (C->getZExtValue() <= 31) |
35934 | weight = CW_Constant; |
35935 | } |
35936 | break; |
35937 | case 'J': |
35938 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35939 | if (C->getZExtValue() <= 63) |
35940 | weight = CW_Constant; |
35941 | } |
35942 | break; |
35943 | case 'K': |
35944 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35945 | if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) |
35946 | weight = CW_Constant; |
35947 | } |
35948 | break; |
35949 | case 'L': |
35950 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35951 | if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) |
35952 | weight = CW_Constant; |
35953 | } |
35954 | break; |
35955 | case 'M': |
35956 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35957 | if (C->getZExtValue() <= 3) |
35958 | weight = CW_Constant; |
35959 | } |
35960 | break; |
35961 | case 'N': |
35962 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35963 | if (C->getZExtValue() <= 0xff) |
35964 | weight = CW_Constant; |
35965 | } |
35966 | break; |
35967 | case 'G': |
35968 | case 'C': |
35969 | if (isa<ConstantFP>(CallOperandVal)) { |
35970 | weight = CW_Constant; |
35971 | } |
35972 | break; |
35973 | case 'e': |
35974 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35975 | if ((C->getSExtValue() >= -0x80000000LL) && |
35976 | (C->getSExtValue() <= 0x7fffffffLL)) |
35977 | weight = CW_Constant; |
35978 | } |
35979 | break; |
35980 | case 'Z': |
35981 | if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { |
35982 | if (C->getZExtValue() <= 0xffffffff) |
35983 | weight = CW_Constant; |
35984 | } |
35985 | break; |
35986 | } |
35987 | return weight; |
35988 | } |
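// Editorial note (illustrative, with assumed operand types): an <8 x float>
// operand (256 bits) under constraint 'x' on an AVX-capable subtarget weighs
// in as CW_Register via the 'x' case above; a 32-bit integer under 'a'
// yields CW_SpecificReg; and a literal 7 under 'I' yields CW_Constant.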
35989 | |
35990 | /// Try to replace an X constraint, which matches anything, with another that |
35991 | /// has more specific requirements based on the type of the corresponding |
35992 | /// operand. |
35993 | const char *X86TargetLowering:: |
35994 | LowerXConstraint(EVT ConstraintVT) const { |
35995 | // FP X constraints get lowered to SSE1/2 registers if available, otherwise |
35996 | // 'f' like normal targets. |
35997 | if (ConstraintVT.isFloatingPoint()) { |
35998 | if (Subtarget.hasSSE2()) |
35999 | return "Y"; |
36000 | if (Subtarget.hasSSE1()) |
36001 | return "x"; |
36002 | } |
36003 | |
36004 | return TargetLowering::LowerXConstraint(ConstraintVT); |
36005 | } |
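// Editorial note (illustrative): for a 'float' operand under the catch-all
// "X" constraint, an SSE2 subtarget rewrites it to "Y" (SSE registers), an
// SSE1-only subtarget to "x", and anything else falls back to the generic
// handling (effectively 'f', the FP stack).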
36006 | |
36007 | /// Lower the specified operand into the Ops vector. |
36008 | /// If it is invalid, don't add anything to Ops. |
36009 | void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, |
36010 | std::string &Constraint, |
36011 | std::vector<SDValue> &Ops, |
36012 | SelectionDAG &DAG) const { |
36013 | SDValue Result; |
36014 | |
36015 | // Only support length 1 constraints for now. |
36016 | if (Constraint.length() > 1) return; |
36017 | |
36018 | char ConstraintLetter = Constraint[0]; |
36019 | switch (ConstraintLetter) { |
36020 | default: break; |
36021 | case 'I': |
36022 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36023 | if (C->getZExtValue() <= 31) { |
36024 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36025 | Op.getValueType()); |
36026 | break; |
36027 | } |
36028 | } |
36029 | return; |
36030 | case 'J': |
36031 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36032 | if (C->getZExtValue() <= 63) { |
36033 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36034 | Op.getValueType()); |
36035 | break; |
36036 | } |
36037 | } |
36038 | return; |
36039 | case 'K': |
36040 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36041 | if (isInt<8>(C->getSExtValue())) { |
36042 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36043 | Op.getValueType()); |
36044 | break; |
36045 | } |
36046 | } |
36047 | return; |
36048 | case 'L': |
36049 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36050 | if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || |
36051 | (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { |
36052 | Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), |
36053 | Op.getValueType()); |
36054 | break; |
36055 | } |
36056 | } |
36057 | return; |
36058 | case 'M': |
36059 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36060 | if (C->getZExtValue() <= 3) { |
36061 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36062 | Op.getValueType()); |
36063 | break; |
36064 | } |
36065 | } |
36066 | return; |
36067 | case 'N': |
36068 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36069 | if (C->getZExtValue() <= 255) { |
36070 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36071 | Op.getValueType()); |
36072 | break; |
36073 | } |
36074 | } |
36075 | return; |
36076 | case 'O': |
36077 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36078 | if (C->getZExtValue() <= 127) { |
36079 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36080 | Op.getValueType()); |
36081 | break; |
36082 | } |
36083 | } |
36084 | return; |
36085 | case 'e': { |
36086 | // 32-bit signed value |
36087 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36088 | if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), |
36089 | C->getSExtValue())) { |
36090 | // Widen to 64 bits here to get it sign extended. |
36091 | Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); |
36092 | break; |
36093 | } |
36094 | // FIXME gcc accepts some relocatable values here too, but only in certain |
36095 | // memory models; it's complicated. |
36096 | } |
36097 | return; |
36098 | } |
36099 | case 'Z': { |
36100 | // 32-bit unsigned value |
36101 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { |
36102 | if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), |
36103 | C->getZExtValue())) { |
36104 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), |
36105 | Op.getValueType()); |
36106 | break; |
36107 | } |
36108 | } |
36109 | // FIXME gcc accepts some relocatable values here too, but only in certain |
36110 | // memory models; it's complicated. |
36111 | return; |
36112 | } |
36113 | case 'i': { |
36114 | // Literal immediates are always ok. |
36115 | if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { |
36116 | // Widen to 64 bits here to get it sign extended. |
36117 | Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); |
36118 | break; |
36119 | } |
36120 | |
36121 | // In any sort of PIC mode addresses need to be computed at runtime by |
36122 | // adding in a register or some sort of table lookup. These can't |
36123 | // be used as immediates. |
36124 | if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) |
36125 | return; |
36126 | |
36127 | // If we are in non-pic codegen mode, we allow the address of a global (with |
36128 | // an optional displacement) to be used with 'i'. |
36129 | GlobalAddressSDNode *GA = nullptr; |
36130 | int64_t Offset = 0; |
36131 | |
36132 | // Match either (GA), (GA+C), (GA+C1+C2), etc. |
36133 | while (1) { |
36134 | if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { |
36135 | Offset += GA->getOffset(); |
36136 | break; |
36137 | } else if (Op.getOpcode() == ISD::ADD) { |
36138 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { |
36139 | Offset += C->getZExtValue(); |
36140 | Op = Op.getOperand(0); |
36141 | continue; |
36142 | } |
36143 | } else if (Op.getOpcode() == ISD::SUB) { |
36144 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { |
36145 | Offset += -C->getZExtValue(); |
36146 | Op = Op.getOperand(0); |
36147 | continue; |
36148 | } |
36149 | } |
36150 | |
36151 | // Otherwise, this isn't something we can handle, reject it. |
36152 | return; |
36153 | } |
36154 | |
36155 | const GlobalValue *GV = GA->getGlobal(); |
36156 | // If we require an extra load to get this address, as in PIC mode, we |
36157 | // can't accept it. |
36158 | if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) |
36159 | return; |
36160 | |
36161 | Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), |
36162 | GA->getValueType(0), Offset); |
36163 | break; |
36164 | } |
36165 | } |
36166 | |
36167 | if (Result.getNode()) { |
36168 | Ops.push_back(Result); |
36169 | return; |
36170 | } |
36171 | return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
36172 | } |
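// Editorial note (hypothetical usage, not from the source): for
//   asm volatile("outb %0, %1" : : "a"(val), "N"(0x80));
// the 'N' case above accepts the literal 0x80 (<= 255) and emits it as a
// target constant; an out-of-range value such as 0x1ff fails the check and
// the operand is rejected (nothing is pushed into Ops).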
36173 | |
36174 | /// Check if \p RC is a general purpose register class. |
36175 | /// I.e., GR* or one of their variant. |
36176 | static bool isGRClass(const TargetRegisterClass &RC) { |
36177 | return RC.hasSuperClassEq(&X86::GR8RegClass) || |
36178 | RC.hasSuperClassEq(&X86::GR16RegClass) || |
36179 | RC.hasSuperClassEq(&X86::GR32RegClass) || |
36180 | RC.hasSuperClassEq(&X86::GR64RegClass) || |
36181 | RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); |
36182 | } |
36183 | |
36184 | /// Check if \p RC is a vector register class. |
36185 | /// I.e., FR* / VR* or one of their variant. |
36186 | static bool isFRClass(const TargetRegisterClass &RC) { |
36187 | return RC.hasSuperClassEq(&X86::FR32XRegClass) || |
36188 | RC.hasSuperClassEq(&X86::FR64XRegClass) || |
36189 | RC.hasSuperClassEq(&X86::VR128XRegClass) || |
36190 | RC.hasSuperClassEq(&X86::VR256XRegClass) || |
36191 | RC.hasSuperClassEq(&X86::VR512RegClass); |
36192 | } |
36193 | |
36194 | std::pair<unsigned, const TargetRegisterClass *> |
36195 | X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, |
36196 | StringRef Constraint, |
36197 | MVT VT) const { |
36198 | // First, see if this is a constraint that directly corresponds to an LLVM |
36199 | // register class. |
36200 | if (Constraint.size() == 1) { |
36201 | // GCC Constraint Letters |
36202 | switch (Constraint[0]) { |
36203 | default: break; |
36204 | // TODO: Slight differences here in allocation order and leaving |
36205 | // RIP in the class. Do they matter any more here than they do |
36206 | // in the normal allocation? |
36207 | case 'k': |
36208 | if (Subtarget.hasAVX512()) { |
36209 | // Only supported in AVX512 or later. |
36210 | switch (VT.SimpleTy) { |
36211 | default: break; |
36212 | case MVT::i32: |
36213 | return std::make_pair(0U, &X86::VK32RegClass); |
36214 | case MVT::i16: |
36215 | return std::make_pair(0U, &X86::VK16RegClass); |
36216 | case MVT::i8: |
36217 | return std::make_pair(0U, &X86::VK8RegClass); |
36218 | case MVT::i1: |
36219 | return std::make_pair(0U, &X86::VK1RegClass); |
36220 | case MVT::i64: |
36221 | return std::make_pair(0U, &X86::VK64RegClass); |
36222 | } |
36223 | } |
36224 | break; |
36225 | case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. |
36226 | if (Subtarget.is64Bit()) { |
36227 | if (VT == MVT::i32 || VT == MVT::f32) |
36228 | return std::make_pair(0U, &X86::GR32RegClass); |
36229 | if (VT == MVT::i16) |
36230 | return std::make_pair(0U, &X86::GR16RegClass); |
36231 | if (VT == MVT::i8 || VT == MVT::i1) |
36232 | return std::make_pair(0U, &X86::GR8RegClass); |
36233 | if (VT == MVT::i64 || VT == MVT::f64) |
36234 | return std::make_pair(0U, &X86::GR64RegClass); |
36235 | break; |
36236 | } |
36237 | LLVM_FALLTHROUGH; |
36238 | // 32-bit fallthrough |
36239 | case 'Q': // Q_REGS |
36240 | if (VT == MVT::i32 || VT == MVT::f32) |
36241 | return std::make_pair(0U, &X86::GR32_ABCDRegClass); |
36242 | if (VT == MVT::i16) |
36243 | return std::make_pair(0U, &X86::GR16_ABCDRegClass); |
36244 | if (VT == MVT::i8 || VT == MVT::i1) |
36245 | return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); |
36246 | if (VT == MVT::i64) |
36247 | return std::make_pair(0U, &X86::GR64_ABCDRegClass); |
36248 | break; |
36249 | case 'r': // GENERAL_REGS |
36250 | case 'l': // INDEX_REGS |
36251 | if (VT == MVT::i8 || VT == MVT::i1) |
36252 | return std::make_pair(0U, &X86::GR8RegClass); |
36253 | if (VT == MVT::i16) |
36254 | return std::make_pair(0U, &X86::GR16RegClass); |
36255 | if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) |
36256 | return std::make_pair(0U, &X86::GR32RegClass); |
36257 | return std::make_pair(0U, &X86::GR64RegClass); |
36258 | case 'R': // LEGACY_REGS |
36259 | if (VT == MVT::i8 || VT == MVT::i1) |
36260 | return std::make_pair(0U, &X86::GR8_NOREXRegClass); |
36261 | if (VT == MVT::i16) |
36262 | return std::make_pair(0U, &X86::GR16_NOREXRegClass); |
36263 | if (VT == MVT::i32 || !Subtarget.is64Bit()) |
36264 | return std::make_pair(0U, &X86::GR32_NOREXRegClass); |
36265 | return std::make_pair(0U, &X86::GR64_NOREXRegClass); |
36266 | case 'f': // FP Stack registers. |
36267 | // If SSE is enabled for this VT, use f80 to ensure the isel moves the |
36268 | // value to the correct fpstack register class. |
36269 | if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) |
36270 | return std::make_pair(0U, &X86::RFP32RegClass); |
36271 | if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) |
36272 | return std::make_pair(0U, &X86::RFP64RegClass); |
36273 | return std::make_pair(0U, &X86::RFP80RegClass); |
36274 | case 'y': // MMX_REGS if MMX allowed. |
36275 | if (!Subtarget.hasMMX()) break; |
36276 | return std::make_pair(0U, &X86::VR64RegClass); |
36277 | case 'Y': // SSE_REGS if SSE2 allowed |
36278 | if (!Subtarget.hasSSE2()) break; |
36279 | LLVM_FALLTHROUGH; |
36280 | case 'v': |
36281 | case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed |
36282 | if (!Subtarget.hasSSE1()) break; |
36283 | bool VConstraint = (Constraint[0] == 'v'); |
36284 | |
36285 | switch (VT.SimpleTy) { |
36286 | default: break; |
36287 | // Scalar SSE types. |
36288 | case MVT::f32: |
36289 | case MVT::i32: |
36290 | if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) |
36291 | return std::make_pair(0U, &X86::FR32XRegClass); |
36292 | return std::make_pair(0U, &X86::FR32RegClass); |
36293 | case MVT::f64: |
36294 | case MVT::i64: |
36295 | if (VConstraint && Subtarget.hasVLX()) |
36296 | return std::make_pair(0U, &X86::FR64XRegClass); |
36297 | return std::make_pair(0U, &X86::FR64RegClass); |
36298 | // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. |
36299 | // Vector types. |
36300 | case MVT::v16i8: |
36301 | case MVT::v8i16: |
36302 | case MVT::v4i32: |
36303 | case MVT::v2i64: |
36304 | case MVT::v4f32: |
36305 | case MVT::v2f64: |
36306 | if (VConstraint && Subtarget.hasVLX()) |
36307 | return std::make_pair(0U, &X86::VR128XRegClass); |
36308 | return std::make_pair(0U, &X86::VR128RegClass); |
36309 | // AVX types. |
36310 | case MVT::v32i8: |
36311 | case MVT::v16i16: |
36312 | case MVT::v8i32: |
36313 | case MVT::v4i64: |
36314 | case MVT::v8f32: |
36315 | case MVT::v4f64: |
36316 | if (VConstraint && Subtarget.hasVLX()) |
36317 | return std::make_pair(0U, &X86::VR256XRegClass); |
36318 | return std::make_pair(0U, &X86::VR256RegClass); |
36319 | case MVT::v8f64: |
36320 | case MVT::v16f32: |
36321 | case MVT::v16i32: |
36322 | case MVT::v8i64: |
36323 | return std::make_pair(0U, &X86::VR512RegClass); |
36324 | } |
36325 | break; |
36326 | } |
36327 | } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { |
36328 | switch (Constraint[1]) { |
36329 | default: |
36330 | break; |
36331 | case 'k': |
36332 | // This register class doesn't allocate k0 for masked vector operations. |
36333 | if (Subtarget.hasAVX512()) { // Only supported in AVX512. |
36334 | switch (VT.SimpleTy) { |
36335 | default: break; |
36336 | case MVT::i32: |
36337 | return std::make_pair(0U, &X86::VK32WMRegClass); |
36338 | case MVT::i16: |
36339 | return std::make_pair(0U, &X86::VK16WMRegClass); |
36340 | case MVT::i8: |
36341 | return std::make_pair(0U, &X86::VK8WMRegClass); |
36342 | case MVT::i1: |
36343 | return std::make_pair(0U, &X86::VK1WMRegClass); |
36344 | case MVT::i64: |
36345 | return std::make_pair(0U, &X86::VK64WMRegClass); |
36346 | } |
36347 | } |
36348 | break; |
36349 | } |
36350 | } |
36351 | |
36352 | // Use the default implementation in TargetLowering to convert the register |
36353 | // constraint into a member of a register class. |
36354 | std::pair<unsigned, const TargetRegisterClass*> Res; |
36355 | Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
36356 | |
36357 | // Not found as a standard register? |
36358 | if (!Res.second) { |
36359 | // Map {st(0)} .. {st(7)} to the FP0 .. FP7 registers. |
36360 | if (Constraint.size() == 7 && Constraint[0] == '{' && |
36361 | tolower(Constraint[1]) == 's' && |
36362 | tolower(Constraint[2]) == 't' && |
36363 | Constraint[3] == '(' && |
36364 | (Constraint[4] >= '0' && Constraint[4] <= '7') && |
36365 | Constraint[5] == ')' && |
36366 | Constraint[6] == '}') { |
36367 | |
36368 | Res.first = X86::FP0 + Constraint[4] - '0'; |
36369 | Res.second = &X86::RFP80RegClass; |
36370 | return Res; |
36371 | } |
36372 | |
36373 | // GCC allows "st(0)" to be called just plain "st". |
36374 | if (StringRef("{st}").equals_lower(Constraint)) { |
36375 | Res.first = X86::FP0; |
36376 | Res.second = &X86::RFP80RegClass; |
36377 | return Res; |
36378 | } |
36379 | |
36380 | // flags -> EFLAGS |
36381 | if (StringRef("{flags}").equals_lower(Constraint)) { |
36382 | Res.first = X86::EFLAGS; |
36383 | Res.second = &X86::CCRRegClass; |
36384 | return Res; |
36385 | } |
36386 | |
36387 | // 'A' means [ER]AX + [ER]DX. |
36388 | if (Constraint == "A") { |
36389 | if (Subtarget.is64Bit()) { |
36390 | Res.first = X86::RAX; |
36391 | Res.second = &X86::GR64_ADRegClass; |
36392 | } else { |
36393 | assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && |
36394 | "Expecting 64, 32 or 16 bit subtarget"); |
36395 | Res.first = X86::EAX; |
36396 | Res.second = &X86::GR32_ADRegClass; |
36397 | } |
36398 | return Res; |
36399 | } |
36400 | return Res; |
36401 | } |
36402 | |
36403 | // Otherwise, check to see if this is a register class of the wrong value |
36404 | // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to |
36405 | // turn into {ax},{dx}. |
36406 | // MVT::Other is used to specify clobber names. |
36407 | if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) |
36408 | return Res; // Correct type already, nothing to do. |
36409 | |
36410 | // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should |
36411 | // return "eax". This should even work for things like getting 64-bit integer |
36412 | // registers when given an f64 type. |
36413 | const TargetRegisterClass *Class = Res.second; |
36414 | // The generic code will match the first register class that contains the |
36415 | // given register. Thus, based on the ordering of the tablegened file, |
36416 | // the "plain" GR classes might not come first. |
36417 | // Therefore, use a helper method. |
36418 | if (isGRClass(*Class)) { |
36419 | unsigned Size = VT.getSizeInBits(); |
36420 | if (Size == 1) Size = 8; |
36421 | unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); |
36422 | if (DestReg > 0) { |
36423 | Res.first = DestReg; |
36424 | Res.second = Size == 8 ? &X86::GR8RegClass |
36425 | : Size == 16 ? &X86::GR16RegClass |
36426 | : Size == 32 ? &X86::GR32RegClass |
36427 | : &X86::GR64RegClass; |
36428 | assert(Res.second->contains(Res.first) && "Register in register class"); |
36429 | } else { |
36430 | // No register found/type mismatch. |
36431 | Res.first = 0; |
36432 | Res.second = nullptr; |
36433 | } |
36434 | } else if (isFRClass(*Class)) { |
36435 | // Handle references to XMM physical registers that got mapped into the |
36436 | // wrong class. This can happen with constraints like {xmm0} where the |
36437 | // target independent register mapper will just pick the first match it can |
36438 | // find, ignoring the required type. |
36439 | |
36440 | // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. |
36441 | if (VT == MVT::f32 || VT == MVT::i32) |
36442 | Res.second = &X86::FR32RegClass; |
36443 | else if (VT == MVT::f64 || VT == MVT::i64) |
36444 | Res.second = &X86::FR64RegClass; |
36445 | else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) |
36446 | Res.second = &X86::VR128RegClass; |
36447 | else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) |
36448 | Res.second = &X86::VR256RegClass; |
36449 | else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) |
36450 | Res.second = &X86::VR512RegClass; |
36451 | else { |
36452 | // Type mismatch and not a clobber: return an error. |
36453 | Res.first = 0; |
36454 | Res.second = nullptr; |
36455 | } |
36456 | } |
36457 | |
36458 | return Res; |
36459 | } |
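// Editorial note (illustrative, assumed operand types): "{ax}" requested
// with MVT::i32 is resized above via getX86SubSuperRegisterOrZero to EAX in
// GR32RegClass; "{xmm0}" requested with a 256-bit vector type has its class
// remapped from VR128 to VR256; and "{st(3)}" resolves to FP3 in
// RFP80RegClass.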
36460 | |
36461 | int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, |
36462 | const AddrMode &AM, Type *Ty, |
36463 | unsigned AS) const { |
36464 | // Scaling factors are not free at all. |
36465 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), |
36466 | // will take 2 allocations in the out of order engine instead of 1 |
36467 | // for plain addressing mode, i.e. inst (reg1). |
36468 | // E.g., |
36469 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 |
36470 | // Requires two allocations (one for the load, one for the computation) |
36471 | // whereas: |
36472 | // vaddps (%rsi), %ymm0, %ymm1 |
36473 | // Requires just 1 allocation, i.e., freeing allocations for other operations |
36474 | // and having fewer micro-operations to execute. |
36475 | // |
36476 | // For some X86 architectures, this is even worse because for instance for |
36477 | // stores, the complex addressing mode forces the instruction to use the |
36478 | // "load" ports instead of the dedicated "store" port. |
36479 | // E.g., on Haswell: |
36480 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. |
36481 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. |
36482 | if (isLegalAddressingMode(DL, AM, Ty, AS)) |
36483 | // Scale represents reg2 * scale, thus account for 1 |
36484 | // as soon as we use a second register. |
36485 | return AM.Scale != 0; |
36486 | return -1; |
36487 | } |
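// Editorial note (illustrative worked example): for a mode like
// (%rsi,%rdx,4), AM.Scale is 4, so a legal mode costs 1 (the scaled index
// occupies a second allocation); a plain (%rsi) mode costs 0, and an illegal
// mode returns -1.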
36488 | |
36489 | bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { |
36490 | // Integer division on x86 is expensive. However, when aggressively optimizing |
36491 | // for code size, we prefer to use a div instruction, as it is usually smaller |
36492 | // than the alternative sequence. |
36493 | // The exception to this is vector division. Since x86 doesn't have vector |
36494 | // integer division, leaving the division as-is is a loss even in terms of |
36495 | // size, because it will have to be scalarized, while the alternative code |
36496 | // sequence can be performed in vector form. |
36497 | bool OptSize = |
36498 | Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); |
36499 | return OptSize && !VT.isVector(); |
36500 | } |
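// Editorial note (illustrative, assumed IR): under minsize, a scalar
// 'sdiv i32 %x, 10' stays a real div (smaller than the multiply-by-magic
// expansion), while a vector 'sdiv <4 x i32>' still reports "not cheap" so
// it can be expanded, since x86 has no vector integer divide and the div
// would otherwise be scalarized.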
36501 | |
36502 | void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { |
36503 | if (!Subtarget.is64Bit()) |
36504 | return; |
36505 | |
36506 | // Update IsSplitCSR in X86MachineFunctionInfo. |
36507 | X86MachineFunctionInfo *AFI = |
36508 | Entry->getParent()->getInfo<X86MachineFunctionInfo>(); |
36509 | AFI->setIsSplitCSR(true); |
36510 | } |
36511 | |
36512 | void X86TargetLowering::insertCopiesSplitCSR( |
36513 | MachineBasicBlock *Entry, |
36514 | const SmallVectorImpl<MachineBasicBlock *> &Exits) const { |
36515 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
36516 | const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); |
36517 | if (!IStart) |
36518 | return; |
36519 | |
36520 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); |
36521 | MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); |
36522 | MachineBasicBlock::iterator MBBI = Entry->begin(); |
36523 | for (const MCPhysReg *I = IStart; *I; ++I) { |
36524 | const TargetRegisterClass *RC = nullptr; |
36525 | if (X86::GR64RegClass.contains(*I)) |
36526 | RC = &X86::GR64RegClass; |
36527 | else |
36528 | llvm_unreachable("Unexpected register class in CSRsViaCopy!"); |
36529 | |
36530 | unsigned NewVR = MRI->createVirtualRegister(RC); |
36531 | // Create copy from CSR to a virtual register. |
36532 | // FIXME: this currently does not emit CFI pseudo-instructions, it works |
36533 | // fine for CXX_FAST_TLS since the C++-style TLS access functions should be |
36534 | // nounwind. If we want to generalize this later, we may need to emit |
36535 | // CFI pseudo-instructions. |
36536 | assert(Entry->getParent()->getFunction()->hasFnAttribute( |
36537 | Attribute::NoUnwind) && |
36538 | "Function should be nounwind in insertCopiesSplitCSR!"); |
36539 | Entry->addLiveIn(*I); |
36540 | BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) |
36541 | .addReg(*I); |
36542 | |
36543 | // Insert the copy-back instructions right before the terminator. |
36544 | for (auto *Exit : Exits) |
36545 | BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), |
36546 | TII->get(TargetOpcode::COPY), *I) |
36547 | .addReg(NewVR); |
36548 | } |
36549 | } |
36550 | |
36551 | bool X86TargetLowering::supportSwiftError() const { |
36552 | return Subtarget.is64Bit(); |
36553 | } |
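// Editorial note (editorial assumption): swifterror lowering reserves a
// dedicated register (R12 on x86-64), which is why 32-bit subtargets report
// no support here.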
36554 | |
36555 | /// Returns the name of the symbol used to emit stack probes or the empty |
36556 | /// string if not applicable. |
36557 | StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { |
36558 | // If the function specifically requests stack probes, emit them. |
36559 | if (MF.getFunction()->hasFnAttribute("probe-stack")) |
36560 | return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); |
36561 | |
36562 | // Generally, if we aren't on Windows, the platform ABI does not include |
36563 | // support for stack probes, so don't emit them. |
36564 | if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) |
36565 | return ""; |
36566 | |
36567 | // We need a stack probe to conform to the Windows ABI. Choose the right |
36568 | // symbol. |
36569 | if (Subtarget.is64Bit()) |
36570 | return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; |
36571 | return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; |
36572 | } |