Bug Summary

File: lib/Target/X86/X86ISelLowering.cpp
Location: line 6075, column 33
Description: Called C++ object pointer is null
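The message "Called C++ object pointer is null" means the analyzer found at least one feasible path on which a non-static member function is invoked through a pointer whose value is null at the call site (here, the call at line 6075, column 33 of this file). As a rough, hypothetical sketch of this defect class — the names below are illustrative only and are not taken from X86ISelLowering.cpp:

  // Hypothetical reduction of the warning, not the actual LLVM code.
  struct Lowering {
    void lower() {}                           // any non-static member function
  };

  Lowering *lookup(bool found) {
    return found ? new Lowering() : nullptr;  // may return null
  }

  void run(bool found) {
    Lowering *L = lookup(found);              // analyzer tracks the possibly-null result
    L->lower();                               // warning: called C++ object pointer is null
  }

Such reports are typically resolved by guarding the call (e.g. "if (L) L->lower();") or by asserting the invariant that rules out the null path, assuming that invariant really holds for the code in question.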

Annotated Source Code

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
16#include "Utils/X86ShuffleDecode.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallBitVector.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/StringSwitch.h"
28#include "llvm/ADT/VariadicFunction.h"
29#include "llvm/CodeGen/IntrinsicLowering.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/IR/CallSite.h"
37#include "llvm/IR/CallingConv.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DerivedTypes.h"
40#include "llvm/IR/Function.h"
41#include "llvm/IR/GlobalAlias.h"
42#include "llvm/IR/GlobalVariable.h"
43#include "llvm/IR/Instructions.h"
44#include "llvm/IR/Intrinsics.h"
45#include "llvm/MC/MCAsmInfo.h"
46#include "llvm/MC/MCContext.h"
47#include "llvm/MC/MCExpr.h"
48#include "llvm/MC/MCSymbol.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/Debug.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Support/MathExtras.h"
53#include "llvm/Target/TargetOptions.h"
54#include "X86IntrinsicsInfo.h"
55#include <bitset>
56#include <numeric>
57#include <cctype>
58using namespace llvm;
59
60#define DEBUG_TYPE "x86-isel"
61
62STATISTIC(NumTailCalls, "Number of tail calls");
63
64static cl::opt<bool> ExperimentalVectorWideningLegalization(
65 "x86-experimental-vector-widening-legalization", cl::init(false),
66 cl::desc("Enable an experimental vector type legalization through widening "
67 "rather than promotion."),
68 cl::Hidden);
69
70static cl::opt<bool> ExperimentalVectorShuffleLowering(
71 "x86-experimental-vector-shuffle-lowering", cl::init(true),
72 cl::desc("Enable an experimental vector shuffle lowering code path."),
73 cl::Hidden);
74
75static cl::opt<bool> ExperimentalVectorShuffleLegality(
76 "x86-experimental-vector-shuffle-legality", cl::init(false),
77 cl::desc("Enable experimental shuffle legality based on the experimental "
78 "shuffle lowering. Should only be used with the experimental "
79 "shuffle lowering."),
80 cl::Hidden);
81
82static cl::opt<int> ReciprocalEstimateRefinementSteps(
83 "x86-recip-refinement-steps", cl::init(1),
84 cl::desc("Specify the number of Newton-Raphson iterations applied to the "
85 "result of the hardware reciprocal estimate instruction."),
86 cl::NotHidden);
87
88// Forward declarations.
89static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
90 SDValue V2);
91
92static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
93 SelectionDAG &DAG, SDLoc dl,
94 unsigned vectorWidth) {
95 assert((vectorWidth == 128 || vectorWidth == 256) &&
96 "Unsupported vector width");
97 EVT VT = Vec.getValueType();
98 EVT ElVT = VT.getVectorElementType();
99 unsigned Factor = VT.getSizeInBits()/vectorWidth;
100 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
101 VT.getVectorNumElements()/Factor);
102
103 // Extract from UNDEF is UNDEF.
104 if (Vec.getOpcode() == ISD::UNDEF)
105 return DAG.getUNDEF(ResultVT);
106
107 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
108 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
109
110 // This is the index of the first element of the vectorWidth-bit chunk
111 // we want.
112 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
113 * ElemsPerChunk);
114
115 // If the input is a buildvector just emit a smaller one.
116 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
117 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
118 makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
119 ElemsPerChunk));
120
121 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
122 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
123}
124
125/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
126/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
127/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
128/// instructions or a simple subregister reference. Idx is an index in the
129/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
130/// lowering EXTRACT_VECTOR_ELT operations easier.
131static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
132 SelectionDAG &DAG, SDLoc dl) {
133 assert((Vec.getValueType().is256BitVector() ||
134 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
135 return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
136}
137
138/// Generate a DAG to grab 256-bits from a 512-bit vector.
139static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
140 SelectionDAG &DAG, SDLoc dl) {
141 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
142 return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
143}
144
145static SDValue InsertSubVector(SDValue Result, SDValue Vec,
146 unsigned IdxVal, SelectionDAG &DAG,
147 SDLoc dl, unsigned vectorWidth) {
148 assert((vectorWidth == 128 || vectorWidth == 256) &&
149 "Unsupported vector width");
150 // Inserting UNDEF is Result
151 if (Vec.getOpcode() == ISD::UNDEF)
152 return Result;
153 EVT VT = Vec.getValueType();
154 EVT ElVT = VT.getVectorElementType();
155 EVT ResultVT = Result.getValueType();
156
157 // Insert the relevant vectorWidth bits.
158 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
159
160 // This is the index of the first element of the vectorWidth-bit chunk
161 // we want.
162 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
163 * ElemsPerChunk);
164
165 SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
166 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
167}
168
169/// Generate a DAG to put 128-bits into a vector > 128 bits. This
170/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
171/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
172/// simple superregister reference. Idx is an index in the 128 bits
173/// we want. It need not be aligned to a 128-bit boundary. That makes
174/// lowering INSERT_VECTOR_ELT operations easier.
175static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
176 SelectionDAG &DAG,SDLoc dl) {
177 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
178 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
179}
180
181static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
182 SelectionDAG &DAG, SDLoc dl) {
183 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
184 return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
185}
186
187/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
188/// instructions. This is used because creating CONCAT_VECTOR nodes of
189/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
190/// large BUILD_VECTORS.
191static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
192 unsigned NumElems, SelectionDAG &DAG,
193 SDLoc dl) {
194 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
195 return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
196}
197
198static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
199 unsigned NumElems, SelectionDAG &DAG,
200 SDLoc dl) {
201 SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
202 return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
203}
204
205X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
206 : TargetLowering(TM) {
207 Subtarget = &TM.getSubtarget<X86Subtarget>();
208 X86ScalarSSEf64 = Subtarget->hasSSE2();
209 X86ScalarSSEf32 = Subtarget->hasSSE1();
210 TD = getDataLayout();
211
212 // Set up the TargetLowering object.
213 static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
214
215 // X86 is weird. It always uses i8 for shift amounts and setcc results.
216 setBooleanContents(ZeroOrOneBooleanContent);
217 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
218 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
219
220 // For 64-bit, since we have so many registers, use the ILP scheduler.
221 // For 32-bit, use the register pressure specific scheduling.
222 // For Atom, always use ILP scheduling.
223 if (Subtarget->isAtom())
224 setSchedulingPreference(Sched::ILP);
225 else if (Subtarget->is64Bit())
226 setSchedulingPreference(Sched::ILP);
227 else
228 setSchedulingPreference(Sched::RegPressure);
229 const X86RegisterInfo *RegInfo =
230 TM.getSubtarget<X86Subtarget>().getRegisterInfo();
231 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
232
233 // Bypass expensive divides on Atom when compiling with O2.
234 if (TM.getOptLevel() >= CodeGenOpt::Default) {
235 if (Subtarget->hasSlowDivide32())
236 addBypassSlowDiv(32, 8);
237 if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
238 addBypassSlowDiv(64, 16);
239 }
240
241 if (Subtarget->isTargetKnownWindowsMSVC()) {
242 // Setup Windows compiler runtime calls.
243 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
244 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
245 setLibcallName(RTLIB::SREM_I64, "_allrem");
246 setLibcallName(RTLIB::UREM_I64, "_aullrem");
247 setLibcallName(RTLIB::MUL_I64, "_allmul");
248 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
249 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
250 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
251 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
252 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
253
254 // The _ftol2 runtime function has an unusual calling conv, which
255 // is modeled by a special pseudo-instruction.
256 setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
257 setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
258 setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
259 setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
260 }
261
262 if (Subtarget->isTargetDarwin()) {
263 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
264 setUseUnderscoreSetJmp(false);
265 setUseUnderscoreLongJmp(false);
266 } else if (Subtarget->isTargetWindowsGNU()) {
267 // MS runtime is weird: it exports _setjmp, but longjmp!
268 setUseUnderscoreSetJmp(true);
269 setUseUnderscoreLongJmp(false);
270 } else {
271 setUseUnderscoreSetJmp(true);
272 setUseUnderscoreLongJmp(true);
273 }
274
275 // Set up the register classes.
276 addRegisterClass(MVT::i8, &X86::GR8RegClass);
277 addRegisterClass(MVT::i16, &X86::GR16RegClass);
278 addRegisterClass(MVT::i32, &X86::GR32RegClass);
279 if (Subtarget->is64Bit())
280 addRegisterClass(MVT::i64, &X86::GR64RegClass);
281
282 for (MVT VT : MVT::integer_valuetypes())
283 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
284
285 // We don't accept any truncstore of integer registers.
286 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
287 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
288 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
289 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
290 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
291 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
292
293 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
294
295 // SETOEQ and SETUNE require checking two conditions.
296 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
297 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
298 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
299 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
300 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
301 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
302
303 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
304 // operation.
305 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
306 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
307 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
308
309 if (Subtarget->is64Bit()) {
310 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
311 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
312 } else if (!TM.Options.UseSoftFloat) {
313 // We have an algorithm for SSE2->double, and we turn this into a
314 // 64-bit FILD followed by conditional FADD for other targets.
315 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
316 // We have an algorithm for SSE2, and we turn this into a 64-bit
317 // FILD for other targets.
318 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
319 }
320
321 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
322 // this operation.
323 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
324 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
325
326 if (!TM.Options.UseSoftFloat) {
327 // SSE has no i16 to fp conversion, only i32
328 if (X86ScalarSSEf32) {
329 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
330 // f32 and f64 cases are Legal, f80 case is not
331 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
332 } else {
333 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
334 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
335 }
336 } else {
337 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
338 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
339 }
340
341 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
342 // are Legal, f80 is custom lowered.
343 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
344 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
345
346 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
347 // this operation.
348 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
349 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
350
351 if (X86ScalarSSEf32) {
352 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
353 // f32 and f64 cases are Legal, f80 case is not
354 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
355 } else {
356 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
357 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
358 }
359
360 // Handle FP_TO_UINT by promoting the destination to a larger signed
361 // conversion.
362 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
363 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
364 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
365
366 if (Subtarget->is64Bit()) {
367 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
368 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
369 } else if (!TM.Options.UseSoftFloat) {
370 // Since AVX is a superset of SSE3, only check for SSE here.
371 if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
372 // Expand FP_TO_UINT into a select.
373 // FIXME: We would like to use a Custom expander here eventually to do
374 // the optimal thing for SSE vs. the default expansion in the legalizer.
375 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
376 else
377 // With SSE3 we can use fisttpll to convert to a signed i64; without
378 // SSE, we're stuck with a fistpll.
379 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
380 }
381
382 if (isTargetFTOL()) {
383 // Use the _ftol2 runtime function, which has a pseudo-instruction
384 // to handle its weird calling convention.
385 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
386 }
387
388 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
389 if (!X86ScalarSSEf64) {
390 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
391 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
392 if (Subtarget->is64Bit()) {
393 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
394 // Without SSE, i64->f64 goes through memory.
395 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
396 }
397 }
398
399 // Scalar integer divide and remainder are lowered to use operations that
400 // produce two results, to match the available instructions. This exposes
401 // the two-result form to trivial CSE, which is able to combine x/y and x%y
402 // into a single instruction.
403 //
404 // Scalar integer multiply-high is also lowered to use two-result
405 // operations, to match the available instructions. However, plain multiply
406 // (low) operations are left as Legal, as there are single-result
407 // instructions for this in x86. Using the two-result multiply instructions
408 // when both high and low results are needed must be arranged by dagcombine.
409 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
410 MVT VT = IntVTs[i];
411 setOperationAction(ISD::MULHS, VT, Expand);
412 setOperationAction(ISD::MULHU, VT, Expand);
413 setOperationAction(ISD::SDIV, VT, Expand);
414 setOperationAction(ISD::UDIV, VT, Expand);
415 setOperationAction(ISD::SREM, VT, Expand);
416 setOperationAction(ISD::UREM, VT, Expand);
417
418 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
419 setOperationAction(ISD::ADDC, VT, Custom);
420 setOperationAction(ISD::ADDE, VT, Custom);
421 setOperationAction(ISD::SUBC, VT, Custom);
422 setOperationAction(ISD::SUBE, VT, Custom);
423 }
424
425 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
426 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
427 setOperationAction(ISD::BR_CC , MVT::f32, Expand);
428 setOperationAction(ISD::BR_CC , MVT::f64, Expand);
429 setOperationAction(ISD::BR_CC , MVT::f80, Expand);
430 setOperationAction(ISD::BR_CC , MVT::i8, Expand);
431 setOperationAction(ISD::BR_CC , MVT::i16, Expand);
432 setOperationAction(ISD::BR_CC , MVT::i32, Expand);
433 setOperationAction(ISD::BR_CC , MVT::i64, Expand);
434 setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
435 setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
436 setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
437 setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
438 setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
439 setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
440 setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
441 if (Subtarget->is64Bit())
442 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
443 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
444 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
445 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
446 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
447 setOperationAction(ISD::FREM , MVT::f32 , Expand);
448 setOperationAction(ISD::FREM , MVT::f64 , Expand);
449 setOperationAction(ISD::FREM , MVT::f80 , Expand);
450 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
451
452 // Promote the i8 variants and force them on up to i32 which has a shorter
453 // encoding.
454 setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
455 AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
456 setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
457 AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
458 if (Subtarget->hasBMI()) {
459 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
460 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
461 if (Subtarget->is64Bit())
462 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
463 } else {
464 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
465 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
466 if (Subtarget->is64Bit())
467 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
468 }
469
470 if (Subtarget->hasLZCNT()) {
471 // When promoting the i8 variants, force them to i32 for a shorter
472 // encoding.
473 setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
474 AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
475 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
476 AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
477 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
478 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
479 if (Subtarget->is64Bit())
480 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
481 } else {
482 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
483 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
484 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
485 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
486 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
487 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
488 if (Subtarget->is64Bit()) {
489 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
490 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
491 }
492 }
493
494 // Special handling for half-precision floating point conversions.
495 // If we don't have F16C support, then lower half float conversions
496 // into library calls.
497 if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
498 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
499 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
500 }
501
502 // There's never any support for operations beyond MVT::f32.
503 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
504 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
505 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
506 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
507
508 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
509 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
510 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
511 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
512 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
513 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
514
515 if (Subtarget->hasPOPCNT()) {
516 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
517 } else {
518 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
519 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
520 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
521 if (Subtarget->is64Bit())
522 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
523 }
524
525 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
526
527 if (!Subtarget->hasMOVBE())
528 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
529
530 // These should be promoted to a larger select which is supported.
531 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
532 // X86 wants to expand cmov itself.
533 setOperationAction(ISD::SELECT , MVT::i8 , Custom);
534 setOperationAction(ISD::SELECT , MVT::i16 , Custom);
535 setOperationAction(ISD::SELECT , MVT::i32 , Custom);
536 setOperationAction(ISD::SELECT , MVT::f32 , Custom);
537 setOperationAction(ISD::SELECT , MVT::f64 , Custom);
538 setOperationAction(ISD::SELECT , MVT::f80 , Custom);
539 setOperationAction(ISD::SETCC , MVT::i8 , Custom);
540 setOperationAction(ISD::SETCC , MVT::i16 , Custom);
541 setOperationAction(ISD::SETCC , MVT::i32 , Custom);
542 setOperationAction(ISD::SETCC , MVT::f32 , Custom);
543 setOperationAction(ISD::SETCC , MVT::f64 , Custom);
544 setOperationAction(ISD::SETCC , MVT::f80 , Custom);
545 if (Subtarget->is64Bit()) {
546 setOperationAction(ISD::SELECT , MVT::i64 , Custom);
547 setOperationAction(ISD::SETCC , MVT::i64 , Custom);
548 }
549 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
550 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
551 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
552 // support continuation, user-level threading, and etc.. As a result, no
553 // other SjLj exception interfaces are implemented and please don't build
554 // your own exception handling based on them.
555 // LLVM/Clang supports zero-cost DWARF exception handling.
556 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
557 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
558
559 // Darwin ABI issue.
560 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
561 setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
562 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
563 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
564 if (Subtarget->is64Bit())
565 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
566 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
567 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
568 if (Subtarget->is64Bit()) {
569 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
570 setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
571 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
572 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
573 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
574 }
575 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
576 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
577 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
578 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
579 if (Subtarget->is64Bit()) {
580 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
581 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
582 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
583 }
584
585 if (Subtarget->hasSSE1())
586 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
587
588 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
589
590 // Expand certain atomics
591 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
592 MVT VT = IntVTs[i];
593 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
594 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
595 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
596 }
597
598 if (Subtarget->hasCmpxchg16b()) {
599 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
600 }
601
602 // FIXME - use subtarget debug flags
603 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
604 !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
605 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
606 }
607
608 if (Subtarget->is64Bit()) {
609 setExceptionPointerRegister(X86::RAX);
610 setExceptionSelectorRegister(X86::RDX);
611 } else {
612 setExceptionPointerRegister(X86::EAX);
613 setExceptionSelectorRegister(X86::EDX);
614 }
615 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
616 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
617
618 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
619 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
620
621 setOperationAction(ISD::TRAP, MVT::Other, Legal);
622 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
623
624 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
625 setOperationAction(ISD::VASTART , MVT::Other, Custom);
626 setOperationAction(ISD::VAEND , MVT::Other, Expand);
627 if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
628 // TargetInfo::X86_64ABIBuiltinVaList
629 setOperationAction(ISD::VAARG , MVT::Other, Custom);
630 setOperationAction(ISD::VACOPY , MVT::Other, Custom);
631 } else {
632 // TargetInfo::CharPtrBuiltinVaList
633 setOperationAction(ISD::VAARG , MVT::Other, Expand);
634 setOperationAction(ISD::VACOPY , MVT::Other, Expand);
635 }
636
637 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
638 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
639
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
641
642 if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
643 // f32 and f64 use SSE.
644 // Set up the FP register classes.
645 addRegisterClass(MVT::f32, &X86::FR32RegClass);
646 addRegisterClass(MVT::f64, &X86::FR64RegClass);
647
648 // Use ANDPD to simulate FABS.
649 setOperationAction(ISD::FABS , MVT::f64, Custom);
650 setOperationAction(ISD::FABS , MVT::f32, Custom);
651
652 // Use XORP to simulate FNEG.
653 setOperationAction(ISD::FNEG , MVT::f64, Custom);
654 setOperationAction(ISD::FNEG , MVT::f32, Custom);
655
656 // Use ANDPD and ORPD to simulate FCOPYSIGN.
657 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
658 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
659
660 // Lower this to FGETSIGNx86 plus an AND.
661 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
662 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
663
664 // We don't support sin/cos/fmod
665 setOperationAction(ISD::FSIN , MVT::f64, Expand);
666 setOperationAction(ISD::FCOS , MVT::f64, Expand);
667 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
668 setOperationAction(ISD::FSIN , MVT::f32, Expand);
669 setOperationAction(ISD::FCOS , MVT::f32, Expand);
670 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
671
672 // Expand FP immediates into loads from the stack, except for the special
673 // cases we handle.
674 addLegalFPImmediate(APFloat(+0.0)); // xorpd
675 addLegalFPImmediate(APFloat(+0.0f)); // xorps
676 } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
677 // Use SSE for f32, x87 for f64.
678 // Set up the FP register classes.
679 addRegisterClass(MVT::f32, &X86::FR32RegClass);
680 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
681
682 // Use ANDPS to simulate FABS.
683 setOperationAction(ISD::FABS , MVT::f32, Custom);
684
685 // Use XORP to simulate FNEG.
686 setOperationAction(ISD::FNEG , MVT::f32, Custom);
687
688 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
689
690 // Use ANDPS and ORPS to simulate FCOPYSIGN.
691 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
692 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
693
694 // We don't support sin/cos/fmod
695 setOperationAction(ISD::FSIN , MVT::f32, Expand);
696 setOperationAction(ISD::FCOS , MVT::f32, Expand);
697 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
698
699 // Special cases we handle for FP constants.
700 addLegalFPImmediate(APFloat(+0.0f)); // xorps
701 addLegalFPImmediate(APFloat(+0.0)); // FLD0
702 addLegalFPImmediate(APFloat(+1.0)); // FLD1
703 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
704 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
705
706 if (!TM.Options.UnsafeFPMath) {
707 setOperationAction(ISD::FSIN , MVT::f64, Expand);
708 setOperationAction(ISD::FCOS , MVT::f64, Expand);
709 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
710 }
711 } else if (!TM.Options.UseSoftFloat) {
712 // f32 and f64 in x87.
713 // Set up the FP register classes.
714 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
715 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
716
717 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
718 setOperationAction(ISD::UNDEF, MVT::f32, Expand);
719 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
720 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
721
722 if (!TM.Options.UnsafeFPMath) {
723 setOperationAction(ISD::FSIN , MVT::f64, Expand);
724 setOperationAction(ISD::FSIN , MVT::f32, Expand);
725 setOperationAction(ISD::FCOS , MVT::f64, Expand);
726 setOperationAction(ISD::FCOS , MVT::f32, Expand);
727 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
729 }
730 addLegalFPImmediate(APFloat(+0.0)); // FLD0
731 addLegalFPImmediate(APFloat(+1.0)); // FLD1
732 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
733 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
734 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
735 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
736 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
737 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
738 }
739
740 // We don't support FMA.
741 setOperationAction(ISD::FMA, MVT::f64, Expand);
742 setOperationAction(ISD::FMA, MVT::f32, Expand);
743
744 // Long double always uses X87.
745 if (!TM.Options.UseSoftFloat) {
746 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
747 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
748 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
749 {
750 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
751 addLegalFPImmediate(TmpFlt); // FLD0
752 TmpFlt.changeSign();
753 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
754
755 bool ignored;
756 APFloat TmpFlt2(+1.0);
757 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
758 &ignored);
759 addLegalFPImmediate(TmpFlt2); // FLD1
760 TmpFlt2.changeSign();
761 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
762 }
763
764 if (!TM.Options.UnsafeFPMath) {
765 setOperationAction(ISD::FSIN , MVT::f80, Expand);
766 setOperationAction(ISD::FCOS , MVT::f80, Expand);
767 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
768 }
769
770 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
771 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
772 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
773 setOperationAction(ISD::FRINT, MVT::f80, Expand);
774 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
775 setOperationAction(ISD::FMA, MVT::f80, Expand);
776 }
777
778 // Always use a library call for pow.
779 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
780 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
781 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
782
783 setOperationAction(ISD::FLOG, MVT::f80, Expand);
784 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
785 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
786 setOperationAction(ISD::FEXP, MVT::f80, Expand);
787 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
788 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
789 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
790
791 // First set operation action for all vector types to either promote
792 // (for widening) or expand (for scalarization). Then we will selectively
793 // turn on ones that can be effectively codegen'd.
794 for (MVT VT : MVT::vector_valuetypes()) {
795 setOperationAction(ISD::ADD , VT, Expand);
796 setOperationAction(ISD::SUB , VT, Expand);
797 setOperationAction(ISD::FADD, VT, Expand);
798 setOperationAction(ISD::FNEG, VT, Expand);
799 setOperationAction(ISD::FSUB, VT, Expand);
800 setOperationAction(ISD::MUL , VT, Expand);
801 setOperationAction(ISD::FMUL, VT, Expand);
802 setOperationAction(ISD::SDIV, VT, Expand);
803 setOperationAction(ISD::UDIV, VT, Expand);
804 setOperationAction(ISD::FDIV, VT, Expand);
805 setOperationAction(ISD::SREM, VT, Expand);
806 setOperationAction(ISD::UREM, VT, Expand);
807 setOperationAction(ISD::LOAD, VT, Expand);
808 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
809 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
810 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
811 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
812 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
813 setOperationAction(ISD::FABS, VT, Expand);
814 setOperationAction(ISD::FSIN, VT, Expand);
815 setOperationAction(ISD::FSINCOS, VT, Expand);
816 setOperationAction(ISD::FCOS, VT, Expand);
817 setOperationAction(ISD::FSINCOS, VT, Expand);
818 setOperationAction(ISD::FREM, VT, Expand);
819 setOperationAction(ISD::FMA, VT, Expand);
820 setOperationAction(ISD::FPOWI, VT, Expand);
821 setOperationAction(ISD::FSQRT, VT, Expand);
822 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
823 setOperationAction(ISD::FFLOOR, VT, Expand);
824 setOperationAction(ISD::FCEIL, VT, Expand);
825 setOperationAction(ISD::FTRUNC, VT, Expand);
826 setOperationAction(ISD::FRINT, VT, Expand);
827 setOperationAction(ISD::FNEARBYINT, VT, Expand);
828 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
829 setOperationAction(ISD::MULHS, VT, Expand);
830 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
831 setOperationAction(ISD::MULHU, VT, Expand);
832 setOperationAction(ISD::SDIVREM, VT, Expand);
833 setOperationAction(ISD::UDIVREM, VT, Expand);
834 setOperationAction(ISD::FPOW, VT, Expand);
835 setOperationAction(ISD::CTPOP, VT, Expand);
836 setOperationAction(ISD::CTTZ, VT, Expand);
837 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
838 setOperationAction(ISD::CTLZ, VT, Expand);
839 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
840 setOperationAction(ISD::SHL, VT, Expand);
841 setOperationAction(ISD::SRA, VT, Expand);
842 setOperationAction(ISD::SRL, VT, Expand);
843 setOperationAction(ISD::ROTL, VT, Expand);
844 setOperationAction(ISD::ROTR, VT, Expand);
845 setOperationAction(ISD::BSWAP, VT, Expand);
846 setOperationAction(ISD::SETCC, VT, Expand);
847 setOperationAction(ISD::FLOG, VT, Expand);
848 setOperationAction(ISD::FLOG2, VT, Expand);
849 setOperationAction(ISD::FLOG10, VT, Expand);
850 setOperationAction(ISD::FEXP, VT, Expand);
851 setOperationAction(ISD::FEXP2, VT, Expand);
852 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
853 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
854 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
855 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
856 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
857 setOperationAction(ISD::TRUNCATE, VT, Expand);
858 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
859 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
860 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
861 setOperationAction(ISD::VSELECT, VT, Expand);
862 setOperationAction(ISD::SELECT_CC, VT, Expand);
863 for (MVT InnerVT : MVT::vector_valuetypes()) {
864 setTruncStoreAction(InnerVT, VT, Expand);
865
866 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
867 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
868
869 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
870 // types, we have to deal with them whether we ask for Expansion or not.
871 // Setting Expand causes its own optimisation problems though, so leave
872 // them legal.
873 if (VT.getVectorElementType() == MVT::i1)
874 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875 }
876 }
877
878 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879 // with -msoft-float, disable use of MMX as well.
880 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
881 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882 // No operations on x86mmx supported, everything uses intrinsics.
883 }
884
885 // MMX-sized vectors (other than x86mmx) are expected to be expanded
886 // into smaller operations.
887 setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
888 setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
889 setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
890 setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
891 setOperationAction(ISD::AND, MVT::v8i8, Expand);
892 setOperationAction(ISD::AND, MVT::v4i16, Expand);
893 setOperationAction(ISD::AND, MVT::v2i32, Expand);
894 setOperationAction(ISD::AND, MVT::v1i64, Expand);
895 setOperationAction(ISD::OR, MVT::v8i8, Expand);
896 setOperationAction(ISD::OR, MVT::v4i16, Expand);
897 setOperationAction(ISD::OR, MVT::v2i32, Expand);
898 setOperationAction(ISD::OR, MVT::v1i64, Expand);
899 setOperationAction(ISD::XOR, MVT::v8i8, Expand);
900 setOperationAction(ISD::XOR, MVT::v4i16, Expand);
901 setOperationAction(ISD::XOR, MVT::v2i32, Expand);
902 setOperationAction(ISD::XOR, MVT::v1i64, Expand);
903 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
904 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
905 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
906 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
907 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
908 setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
909 setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
910 setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
911 setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
912 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
913 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
914 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
915 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
916
917 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
918 addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
919
920 setOperationAction(ISD::FADD, MVT::v4f32, Legal);
921 setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
922 setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
923 setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
924 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
925 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
926 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
927 setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
928 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
929 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
931 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
932 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
933 }
934
935 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
936 addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
937
938 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
939 // registers cannot be used even for integer operations.
940 addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
941 addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
942 addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
943 addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
944
945 setOperationAction(ISD::ADD, MVT::v16i8, Legal);
946 setOperationAction(ISD::ADD, MVT::v8i16, Legal);
947 setOperationAction(ISD::ADD, MVT::v4i32, Legal);
948 setOperationAction(ISD::ADD, MVT::v2i64, Legal);
949 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
950 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
951 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
952 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
953 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
954 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
955 setOperationAction(ISD::SUB, MVT::v16i8, Legal);
956 setOperationAction(ISD::SUB, MVT::v8i16, Legal);
957 setOperationAction(ISD::SUB, MVT::v4i32, Legal);
958 setOperationAction(ISD::SUB, MVT::v2i64, Legal);
959 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
960 setOperationAction(ISD::FADD, MVT::v2f64, Legal);
961 setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
962 setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
963 setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
964 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
965 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
966 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
967
968 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
969 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
970 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
971 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
972
973 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
974 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
975 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
976 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
977 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
978
979 // Only provide customized ctpop vector bit twiddling for vector types we
980 // know to perform better than using the popcnt instructions on each vector
981 // element. If popcnt isn't supported, always provide the custom version.
982 if (!Subtarget->hasPOPCNT()) {
983 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
984 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
985 }
986
987 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
988 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
989 MVT VT = (MVT::SimpleValueType)i;
990 // Do not attempt to custom lower non-power-of-2 vectors
991 if (!isPowerOf2_32(VT.getVectorNumElements()))
992 continue;
993 // Do not attempt to custom lower non-128-bit vectors
994 if (!VT.is128BitVector())
995 continue;
996 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
997 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
998 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
999 }
1000
1001 // We support custom legalizing of sext and anyext loads for specific
1002 // memory vector types which we can load as a scalar (or sequence of
1003 // scalars) and extend in-register to a legal 128-bit vector type. For sext
1004 // loads these must work with a single scalar load.
1005 for (MVT VT : MVT::integer_vector_valuetypes()) {
1006 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1007 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1008 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1009 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1010 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1011 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1012 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1013 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1014 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1015 }
1016
1017 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
1018 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
1019 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
1020 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
1021 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
1022 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1023
1024 if (Subtarget->is64Bit()) {
1025 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
1026 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1027 }
1028
1029 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1030 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1031 MVT VT = (MVT::SimpleValueType)i;
1032
1033 // Do not attempt to promote non-128-bit vectors
1034 if (!VT.is128BitVector())
1035 continue;
1036
1037 setOperationAction(ISD::AND, VT, Promote);
1038 AddPromotedToType (ISD::AND, VT, MVT::v2i64);
1039 setOperationAction(ISD::OR, VT, Promote);
1040 AddPromotedToType (ISD::OR, VT, MVT::v2i64);
1041 setOperationAction(ISD::XOR, VT, Promote);
1042 AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
1043 setOperationAction(ISD::LOAD, VT, Promote);
1044 AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
1045 setOperationAction(ISD::SELECT, VT, Promote);
1046 AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1047 }
1048
1049 // Custom lower v2i64 and v2f64 selects.
1050 setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
1051 setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
1052 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1053 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1054
1055 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1056 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1057
1058 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
1059 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1060 // As there is no 64-bit GPR available, we need build a special custom
1061 // sequence to convert from v2i32 to v2f32.
1062 if (!Subtarget->is64Bit())
1063 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1064
1065 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1066 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1067
1068 for (MVT VT : MVT::fp_vector_valuetypes())
1069 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1070
1071 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1072 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1073 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1074 }
1075
1076 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1077 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1078 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1079 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1080 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1081 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1082 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1083 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1084 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1085 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1086 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1087
1088 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1089 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1090 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1091 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1092 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
1093 setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
1094 setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
1095 setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
1096 setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
1097 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
1098
1099 // FIXME: Do we need to handle scalar-to-vector here?
1100 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1101
1102 setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
1103 setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
1104 setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
1105 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1106 setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
1107 // There is no BLENDI for byte vectors. We don't need to custom lower
1108 // some vselects for now.
1109 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1110
1111 // SSE41 brings specific instructions for doing vector sign extend even in
1112 // cases where we don't have SRA.
1113 for (MVT VT : MVT::integer_vector_valuetypes()) {
1114 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1115 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1116 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1117 }
1118
1119 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1120 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
1121 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1122 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1123 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1124 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1125 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1126
1127 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
1128 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1129 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1130 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1131 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1132 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1133
1134 // i8 and i16 vectors are custom because the source register and source
1135 // source memory operand types are not the same width. f32 vectors are
1136 // custom since the immediate controlling the insert encodes additional
1137 // information.
1138 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1139 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1140 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1141 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1142
1143 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1144 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1145 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1146 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1147
1148 // FIXME: these should be Legal, but that's only for the case where
1149 // the index is constant. For now custom expand to deal with that.
1150 if (Subtarget->is64Bit()) {
1151 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
1152 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1153 }
1154 }
1155
1156 if (Subtarget->hasSSE2()) {
1157 setOperationAction(ISD::SRL, MVT::v8i16, Custom);
1158 setOperationAction(ISD::SRL, MVT::v16i8, Custom);
1159
1160 setOperationAction(ISD::SHL, MVT::v8i16, Custom);
1161 setOperationAction(ISD::SHL, MVT::v16i8, Custom);
1162
1163 setOperationAction(ISD::SRA, MVT::v8i16, Custom);
1164 setOperationAction(ISD::SRA, MVT::v16i8, Custom);
1165
1166 // In the customized shift lowering, the legal cases in AVX2 will be
1167 // recognized.
1168 setOperationAction(ISD::SRL, MVT::v2i64, Custom);
1169 setOperationAction(ISD::SRL, MVT::v4i32, Custom);
1170
1171 setOperationAction(ISD::SHL, MVT::v2i64, Custom);
1172 setOperationAction(ISD::SHL, MVT::v4i32, Custom);
1173
1174 setOperationAction(ISD::SRA, MVT::v4i32, Custom);
1175 }
1176
1177 if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1178 addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
1179 addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1180 addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
1181 addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
1182 addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
1183 addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
1184
1185 setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
1186 setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
1187 setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
1188
1189 setOperationAction(ISD::FADD, MVT::v8f32, Legal);
1190 setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
1191 setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
1192 setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
1193 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
1194 setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
1195 setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
1196 setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
1197 setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
1198 setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
1199 setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
1200 setOperationAction(ISD::FABS, MVT::v8f32, Custom);
1201
1202 setOperationAction(ISD::FADD, MVT::v4f64, Legal);
1203 setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
1204 setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
1205 setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
1206 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
1207 setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
1208 setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
1209 setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
1210 setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
1211 setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
1212 setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
1213 setOperationAction(ISD::FABS, MVT::v4f64, Custom);
1214
1215 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1216 // even though v8i16 is a legal type.
1217 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
1218 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
1219 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1220
1221 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
1222 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1223 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1224
1225 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1226 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1227
1228 for (MVT VT : MVT::fp_vector_valuetypes())
1229 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1230
1231 setOperationAction(ISD::SRL, MVT::v16i16, Custom);
1232 setOperationAction(ISD::SRL, MVT::v32i8, Custom);
1233
1234 setOperationAction(ISD::SHL, MVT::v16i16, Custom);
1235 setOperationAction(ISD::SHL, MVT::v32i8, Custom);
1236
1237 setOperationAction(ISD::SRA, MVT::v16i16, Custom);
1238 setOperationAction(ISD::SRA, MVT::v32i8, Custom);
1239
1240 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
1241 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
1242 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
1243 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
1244
1245 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1246 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1247 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1248
1249 setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
1250 setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
1251 setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
1252 setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
1253
1254 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1255 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
1256 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1257 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1258 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1259 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1260 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1261 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1262 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1263 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1264 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1265 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1266
1267 if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1268 setOperationAction(ISD::FMA, MVT::v8f32, Legal);
1269 setOperationAction(ISD::FMA, MVT::v4f64, Legal);
1270 setOperationAction(ISD::FMA, MVT::v4f32, Legal);
1271 setOperationAction(ISD::FMA, MVT::v2f64, Legal);
1272 setOperationAction(ISD::FMA, MVT::f32, Legal);
1273 setOperationAction(ISD::FMA, MVT::f64, Legal);
1274 }
1275
1276 if (Subtarget->hasInt256()) {
1277 setOperationAction(ISD::ADD, MVT::v4i64, Legal);
1278 setOperationAction(ISD::ADD, MVT::v8i32, Legal);
1279 setOperationAction(ISD::ADD, MVT::v16i16, Legal);
1280 setOperationAction(ISD::ADD, MVT::v32i8, Legal);
1281
1282 setOperationAction(ISD::SUB, MVT::v4i64, Legal);
1283 setOperationAction(ISD::SUB, MVT::v8i32, Legal);
1284 setOperationAction(ISD::SUB, MVT::v16i16, Legal);
1285 setOperationAction(ISD::SUB, MVT::v32i8, Legal);
1286
1287 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1288 setOperationAction(ISD::MUL, MVT::v8i32, Legal);
1289 setOperationAction(ISD::MUL, MVT::v16i16, Legal);
1290 // Don't lower v32i8 because there is no 128-bit byte mul
1291
1292 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1293 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1294 setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
1295 setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
1296
1297 setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
1298 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1299
1300 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1301 // when we have a 256bit-wide blend with immediate.
1302 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1303
1304 // Only provide customized ctpop vector bit twiddling for vector types we
1305 // know to perform better than using the popcnt instructions on each
1306 // vector element. If popcnt isn't supported, always provide the custom
1307 // version.
1308 if (!Subtarget->hasPOPCNT())
1309 setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
1310
1311 // Custom CTPOP always performs better on natively supported v8i32
1312 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
1313
1314 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1315 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1317 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1318 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1320 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1321
1322 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1324 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1325 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1326 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1327 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1328 } else {
1329 setOperationAction(ISD::ADD, MVT::v4i64, Custom);
1330 setOperationAction(ISD::ADD, MVT::v8i32, Custom);
1331 setOperationAction(ISD::ADD, MVT::v16i16, Custom);
1332 setOperationAction(ISD::ADD, MVT::v32i8, Custom);
1333
1334 setOperationAction(ISD::SUB, MVT::v4i64, Custom);
1335 setOperationAction(ISD::SUB, MVT::v8i32, Custom);
1336 setOperationAction(ISD::SUB, MVT::v16i16, Custom);
1337 setOperationAction(ISD::SUB, MVT::v32i8, Custom);
1338
1339 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1340 setOperationAction(ISD::MUL, MVT::v8i32, Custom);
1341 setOperationAction(ISD::MUL, MVT::v16i16, Custom);
1342 // Don't lower v32i8 because there is no 128-bit byte mul
1343 }
1344
1345 // In the customized shift lowering, the legal cases in AVX2 will be
1346 // recognized.
1347 setOperationAction(ISD::SRL, MVT::v4i64, Custom);
1348 setOperationAction(ISD::SRL, MVT::v8i32, Custom);
1349
1350 setOperationAction(ISD::SHL, MVT::v4i64, Custom);
1351 setOperationAction(ISD::SHL, MVT::v8i32, Custom);
1352
1353 setOperationAction(ISD::SRA, MVT::v8i32, Custom);
1354
1355 // Custom lower several nodes for 256-bit types.
1356 for (MVT VT : MVT::vector_valuetypes()) {
1357 if (VT.getScalarSizeInBits() >= 32) {
1358 setOperationAction(ISD::MLOAD, VT, Legal);
1359 setOperationAction(ISD::MSTORE, VT, Legal);
1360 }
1361 // Extract subvector is special because the value type
1362 // (result) is 128-bit but the source is 256-bit wide.
1363 if (VT.is128BitVector()) {
1364 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1365 }
1366 // Do not attempt to custom lower other non-256-bit vectors
1367 if (!VT.is256BitVector())
1368 continue;
1369
1370 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1371 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1372 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1373 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1374 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1375 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1376 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1377 }
1378
1379 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1380 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1381 MVT VT = (MVT::SimpleValueType)i;
1382
1383 // Do not attempt to promote non-256-bit vectors
1384 if (!VT.is256BitVector())
1385 continue;
1386
1387 setOperationAction(ISD::AND, VT, Promote);
1388 AddPromotedToType (ISD::AND, VT, MVT::v4i64);
1389 setOperationAction(ISD::OR, VT, Promote);
1390 AddPromotedToType (ISD::OR, VT, MVT::v4i64);
1391 setOperationAction(ISD::XOR, VT, Promote);
1392 AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
1393 setOperationAction(ISD::LOAD, VT, Promote);
1394 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
1395 setOperationAction(ISD::SELECT, VT, Promote);
1396 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1397 }
1398 }
1399
1400 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1401 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1402 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1404 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1405
1406 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1407 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1408 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1409
1410 for (MVT VT : MVT::fp_vector_valuetypes())
1411 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1412
1413 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1414 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1415 setOperationAction(ISD::XOR, MVT::i1, Legal);
1416 setOperationAction(ISD::OR, MVT::i1, Legal);
1417 setOperationAction(ISD::AND, MVT::i1, Legal);
1418 setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
1419 setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
1420 setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
1421 setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
1422 setOperationAction(ISD::LOAD, MVT::v16i1, Legal);
1423
1424 setOperationAction(ISD::FADD, MVT::v16f32, Legal);
1425 setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
1426 setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
1427 setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
1428 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
1429 setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
1430
1431 setOperationAction(ISD::FADD, MVT::v8f64, Legal);
1432 setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
1433 setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
1434 setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
1435 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
1436 setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
1437 setOperationAction(ISD::FMA, MVT::v8f64, Legal);
1438 setOperationAction(ISD::FMA, MVT::v16f32, Legal);
1439
1440 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
1441 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
1442 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
1443 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
1444 if (Subtarget->is64Bit()) {
1445 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
1446 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
1447 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
1448 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
1449 }
1450 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1451 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1452 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1453 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1454 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1455 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1456 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1457 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1458 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1459 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1461 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1462 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1463 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1464
1465 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1468 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1469 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1470 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1471 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1472 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1473 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1474 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1475 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1477 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1478
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1481 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1482 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1483 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1484 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
1485
1486 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1487 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1488
1489 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1490
1491 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1492 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1493 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1494 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1495 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1496 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1497 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1498 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1499 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1500
1501 setOperationAction(ISD::ADD, MVT::v8i64, Legal);
1502 setOperationAction(ISD::ADD, MVT::v16i32, Legal);
1503
1504 setOperationAction(ISD::SUB, MVT::v8i64, Legal);
1505 setOperationAction(ISD::SUB, MVT::v16i32, Legal);
1506
1507 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1508
1509 setOperationAction(ISD::SRL, MVT::v8i64, Custom);
1510 setOperationAction(ISD::SRL, MVT::v16i32, Custom);
1511
1512 setOperationAction(ISD::SHL, MVT::v8i64, Custom);
1513 setOperationAction(ISD::SHL, MVT::v16i32, Custom);
1514
1515 setOperationAction(ISD::SRA, MVT::v8i64, Custom);
1516 setOperationAction(ISD::SRA, MVT::v16i32, Custom);
1517
1518 setOperationAction(ISD::AND, MVT::v8i64, Legal);
1519 setOperationAction(ISD::OR, MVT::v8i64, Legal);
1520 setOperationAction(ISD::XOR, MVT::v8i64, Legal);
1521 setOperationAction(ISD::AND, MVT::v16i32, Legal);
1522 setOperationAction(ISD::OR, MVT::v16i32, Legal);
1523 setOperationAction(ISD::XOR, MVT::v16i32, Legal);
1524
1525 if (Subtarget->hasCDI()) {
1526 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1527 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1528 }
1529
1530 // Custom lower several nodes.
1531 for (MVT VT : MVT::vector_valuetypes()) {
1532 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1533 // Extract subvector is special because the value type
1534 // (result) is 256/128-bit but the source is 512-bit wide.
1535 if (VT.is128BitVector() || VT.is256BitVector()) {
1536 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1537 }
1538 if (VT.getVectorElementType() == MVT::i1)
1539 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1540
1541 // Do not attempt to custom lower other non-512-bit vectors
1542 if (!VT.is512BitVector())
1543 continue;
1544
1545 if (EltSize >= 32) {
1546 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1551 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1552 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 }
1556 }
1557 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1558 MVT VT = (MVT::SimpleValueType)i;
1559
1560 // Do not attempt to promote non-512-bit vectors.
1561 if (!VT.is512BitVector())
1562 continue;
1563
1564 setOperationAction(ISD::SELECT, VT, Promote);
1565 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1566 }
1567 }// has AVX-512
1568
1569 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1570 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1571 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1572
1573 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1574 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1575
1576 setOperationAction(ISD::LOAD, MVT::v32i16, Legal);
1577 setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
1578 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1579 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1580 setOperationAction(ISD::ADD, MVT::v32i16, Legal);
1581 setOperationAction(ISD::ADD, MVT::v64i8, Legal);
1582 setOperationAction(ISD::SUB, MVT::v32i16, Legal);
1583 setOperationAction(ISD::SUB, MVT::v64i8, Legal);
1584 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1585
1586 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1587 const MVT VT = (MVT::SimpleValueType)i;
1588
1589 const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1590
1591 // Do not attempt to promote non-512-bit vectors.
1592 if (!VT.is512BitVector())
1593 continue;
1594
1595 if (EltSize < 32) {
1596 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1597 setOperationAction(ISD::VSELECT, VT, Legal);
1598 }
1599 }
1600 }
1601
1602 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1603 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1604 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1605
1606 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1607 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1608 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
1609
1610 setOperationAction(ISD::AND, MVT::v8i32, Legal);
1611 setOperationAction(ISD::OR, MVT::v8i32, Legal);
1612 setOperationAction(ISD::XOR, MVT::v8i32, Legal);
1613 setOperationAction(ISD::AND, MVT::v4i32, Legal);
1614 setOperationAction(ISD::OR, MVT::v4i32, Legal);
1615 setOperationAction(ISD::XOR, MVT::v4i32, Legal);
1616 }
1617
1618 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1619 // of this type with custom code.
1620 for (MVT VT : MVT::vector_valuetypes())
1621 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1622
1623 // We want to custom lower some of our intrinsics.
1624 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1625 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1626 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1627 if (!Subtarget->is64Bit())
1628 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1629
1630 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1631 // handle type legalization for these operations here.
1632 //
1633 // FIXME: We really should do custom legalization for addition and
1634 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1635 // than generic legalization for 64-bit multiplication-with-overflow, though.
1636 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1637 // Add/Sub/Mul with overflow operations are custom lowered.
1638 MVT VT = IntVTs[i];
1639 setOperationAction(ISD::SADDO, VT, Custom);
1640 setOperationAction(ISD::UADDO, VT, Custom);
1641 setOperationAction(ISD::SSUBO, VT, Custom);
1642 setOperationAction(ISD::USUBO, VT, Custom);
1643 setOperationAction(ISD::SMULO, VT, Custom);
1644 setOperationAction(ISD::UMULO, VT, Custom);
1645 }
1646
1647
1648 if (!Subtarget->is64Bit()) {
1649 // These libcalls are not available in 32-bit.
1650 setLibcallName(RTLIB::SHL_I128, nullptr);
1651 setLibcallName(RTLIB::SRL_I128, nullptr);
1652 setLibcallName(RTLIB::SRA_I128, nullptr);
1653 }
1654
1655 // Combine sin / cos into one node or libcall if possible.
1656 if (Subtarget->hasSinCos()) {
1657 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1658 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1659 if (Subtarget->isTargetDarwin()) {
1660 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1661 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1662 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1663 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1664 }
1665 }
1666
1667 if (Subtarget->isTargetWin64()) {
1668 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1669 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1670 setOperationAction(ISD::SREM, MVT::i128, Custom);
1671 setOperationAction(ISD::UREM, MVT::i128, Custom);
1672 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1673 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1674 }
1675
1676 // We have target-specific dag combine patterns for the following nodes:
1677 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1678 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1679 setTargetDAGCombine(ISD::VSELECT);
1680 setTargetDAGCombine(ISD::SELECT);
1681 setTargetDAGCombine(ISD::SHL);
1682 setTargetDAGCombine(ISD::SRA);
1683 setTargetDAGCombine(ISD::SRL);
1684 setTargetDAGCombine(ISD::OR);
1685 setTargetDAGCombine(ISD::AND);
1686 setTargetDAGCombine(ISD::ADD);
1687 setTargetDAGCombine(ISD::FADD);
1688 setTargetDAGCombine(ISD::FSUB);
1689 setTargetDAGCombine(ISD::FMA);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::ZERO_EXTEND);
1696 setTargetDAGCombine(ISD::ANY_EXTEND);
1697 setTargetDAGCombine(ISD::SIGN_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699 setTargetDAGCombine(ISD::TRUNCATE);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::SETCC);
1702 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703 setTargetDAGCombine(ISD::BUILD_VECTOR);
1704 setTargetDAGCombine(ISD::MUL);
1705 setTargetDAGCombine(ISD::XOR);
1706
1707 computeRegisterProperties();
1708
1709 // On Darwin, -Os means optimize for size without hurting performance, so
1710 // do not reduce the limit.
1711 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717 setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719 // Predictable cmovs don't hurt on Atom because it's in-order.
1720 PredictableSelectIsExpensive = !Subtarget->isAtom();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724 verifyIntrinsicTables();
1725}
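
The constructor above is essentially populating a lookup table: for each (opcode, value type) pair it records whether the operation is Legal, should be Promoted or Expanded, or needs Custom lowering, and the legalizer later queries that table. As a rough standalone sketch of the idea (hypothetical types and ids, not the real LLVM TargetLowering API):

#include <cstdio>
#include <map>
#include <utility>

// Hypothetical, simplified stand-in for the (opcode, type) -> action table
// that calls like the setOperationAction() lines above populate.
enum class Action { Legal, Promote, Expand, Custom };

class ActionTable {
  std::map<std::pair<unsigned, unsigned>, Action> Actions;

public:
  void setOperationAction(unsigned Opcode, unsigned VT, Action A) {
    Actions[{Opcode, VT}] = A;
  }
  Action getOperationAction(unsigned Opcode, unsigned VT) const {
    auto It = Actions.find({Opcode, VT});
    return It == Actions.end() ? Action::Legal : It->second; // default: Legal
  }
};

int main() {
  enum { ISD_FNEG = 1 };   // hypothetical opcode id
  enum { MVT_v8f32 = 2 };  // hypothetical type id
  ActionTable TLI;
  TLI.setOperationAction(ISD_FNEG, MVT_v8f32, Action::Custom);
  std::printf("FNEG v8f32 custom? %d\n",
              TLI.getOperationAction(ISD_FNEG, MVT_v8f32) == Action::Custom);
}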
1726
1727// This has so far only been implemented for 64-bit MachO.
1728bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730}
1731
1732TargetLoweringBase::LegalizeTypeAction
1733X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1738
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
1740}
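
getPreferredVectorAction() above opts into type widening when the experimental flag is set: an illegal vector keeps its element type and grows its element count, rather than being promoted to a wider element type. A minimal sketch of the widening side of that trade-off, as a simplified model rather than LLVM's actual legalizer:

#include <iostream>

// Simplified model: widening rounds the element count up to the next power of
// two while keeping the element type, e.g. v3i32 -> v4i32 (promotion would
// instead have produced a vector with wider elements).
static unsigned widenNumElements(unsigned NumElts) {
  unsigned Widened = 1;
  while (Widened < NumElts)
    Widened <<= 1;
  return Widened;
}

int main() {
  std::cout << "v3i32 widens to v" << widenNumElements(3) << "i32\n"; // v4i32
  std::cout << "v6i16 widens to v" << widenNumElements(6) << "i16\n"; // v8i16
}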
1741
1742EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743 if (!VT.isVector())
1744 return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746 const unsigned NumElts = VT.getVectorNumElements();
1747 const EVT EltVT = VT.getVectorElementType();
1748 if (VT.is512BitVector()) {
1749 if (Subtarget->hasAVX512())
1750 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751 EltVT == MVT::f32 || EltVT == MVT::f64)
1752 switch(NumElts) {
1753 case 8: return MVT::v8i1;
1754 case 16: return MVT::v16i1;
1755 }
1756 if (Subtarget->hasBWI())
1757 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758 switch(NumElts) {
1759 case 32: return MVT::v32i1;
1760 case 64: return MVT::v64i1;
1761 }
1762 }
1763
1764 if (VT.is256BitVector() || VT.is128BitVector()) {
1765 if (Subtarget->hasVLX())
1766 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767 EltVT == MVT::f32 || EltVT == MVT::f64)
1768 switch(NumElts) {
1769 case 2: return MVT::v2i1;
1770 case 4: return MVT::v4i1;
1771 case 8: return MVT::v8i1;
1772 }
1773 if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775 switch(NumElts) {
1776 case 8: return MVT::v8i1;
1777 case 16: return MVT::v16i1;
1778 case 32: return MVT::v32i1;
1779 }
1780 }
1781
1782 return VT.changeVectorElementTypeToInteger();
1783}
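
getSetCCResultType() above keys the compare result type on the element count: when AVX-512 style mask registers are available a vector compare yields a vNi1 mask, otherwise the result is an integer vector of the same shape. A standalone sketch of that mapping (hypothetical helper, not part of LLVM):

#include <iostream>
#include <string>

// Hypothetical helper mirroring the switches above: element count -> name of
// the i1 mask type a vector compare would produce when mask registers exist.
static std::string maskTypeForNumElts(unsigned NumElts) {
  switch (NumElts) {
  case 2:  return "v2i1";
  case 4:  return "v4i1";
  case 8:  return "v8i1";
  case 16: return "v16i1";
  case 32: return "v32i1";
  case 64: return "v64i1";
  default: return "<same-shaped integer vector>";
  }
}

int main() {
  std::cout << "v16f32 compare -> " << maskTypeForNumElts(16) << "\n"; // v16i1
}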
1784
1785/// Helper for getByValTypeAlignment to determine
1786/// the desired ByVal argument alignment.
1787static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788 if (MaxAlign == 16)
1789 return;
1790 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791 if (VTy->getBitWidth() == 128)
1792 MaxAlign = 16;
1793 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794 unsigned EltAlign = 0;
1795 getMaxByValAlign(ATy->getElementType(), EltAlign);
1796 if (EltAlign > MaxAlign)
1797 MaxAlign = EltAlign;
1798 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800 unsigned EltAlign = 0;
1801 getMaxByValAlign(STy->getElementType(i), EltAlign);
1802 if (EltAlign > MaxAlign)
1803 MaxAlign = EltAlign;
1804 if (MaxAlign == 16)
1805 break;
1806 }
1807 }
1808}
1809
1810/// Return the desired alignment for ByVal aggregate
1811/// function arguments in the caller parameter area. For X86, aggregates
1812/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813/// are at 4-byte boundaries.
1814unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815 if (Subtarget->is64Bit()) {
1816 // Max of 8 and alignment of type.
1817 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818 if (TyAlign > 8)
1819 return TyAlign;
1820 return 8;
1821 }
1822
1823 unsigned Align = 4;
1824 if (Subtarget->hasSSE1())
1825 getMaxByValAlign(Ty, Align);
1826 return Align;
1827}
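
getByValTypeAlignment() and its getMaxByValAlign() helper walk a possibly nested aggregate and bump the alignment to 16 as soon as a 128-bit vector is found, stopping early once that cap is reached. A simplified standalone model of the recursion, using a hypothetical Type representation:

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical, simplified type node: a nonzero VectorBits marks a vector
// type; Members holds array/struct element types.
struct Ty {
  unsigned VectorBits;
  std::vector<Ty> Members;
};

static void maxByValAlign(const Ty &T, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;                        // already at the SSE alignment cap
  if (T.VectorBits == 128)
    MaxAlign = 16;                 // 128-bit SSE vectors want 16-byte slots
  for (const Ty &M : T.Members) {
    unsigned EltAlign = 0;
    maxByValAlign(M, EltAlign);
    MaxAlign = std::max(MaxAlign, EltAlign);
    if (MaxAlign == 16)
      break;                       // no point looking further
  }
}

int main() {
  Ty Scalar{0, {}};
  Ty Vec{128, {}};
  Ty Struct{0, {Scalar, Vec}};     // { scalar member, 128-bit vector member }
  unsigned Align = 4;              // 32-bit default byval alignment
  maxByValAlign(Struct, Align);
  std::printf("byval alignment = %u\n", Align); // 16
}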
1828
1829/// Returns the target specific optimal type for load
1830/// and store operations as a result of memset, memcpy, and memmove
1831/// lowering. If DstAlign is zero, the destination alignment can satisfy any
1832/// constraint. Similarly, if SrcAlign is zero there is no need to check it
1833/// against an alignment requirement,
1834/// probably because the source does not need to be loaded. If 'IsMemset' is
1835/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837/// source is constant so it does not need to be loaded.
1838/// It returns EVT::Other if the type should be determined using generic
1839/// target-independent logic.
1840EVT
1841X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842 unsigned DstAlign, unsigned SrcAlign,
1843 bool IsMemset, bool ZeroMemset,
1844 bool MemcpyStrSrc,
1845 MachineFunction &MF) const {
1846 const Function *F = MF.getFunction();
1847 if ((!IsMemset || ZeroMemset) &&
1848 !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849 Attribute::NoImplicitFloat)) {
1850 if (Size >= 16 &&
1851 (Subtarget->isUnalignedMemAccessFast() ||
1852 ((DstAlign == 0 || DstAlign >= 16) &&
1853 (SrcAlign == 0 || SrcAlign >= 16)))) {
1854 if (Size >= 32) {
1855 if (Subtarget->hasInt256())
1856 return MVT::v8i32;
1857 if (Subtarget->hasFp256())
1858 return MVT::v8f32;
1859 }
1860 if (Subtarget->hasSSE2())
1861 return MVT::v4i32;
1862 if (Subtarget->hasSSE1())
1863 return MVT::v4f32;
1864 } else if (!MemcpyStrSrc && Size >= 8 &&
1865 !Subtarget->is64Bit() &&
1866 Subtarget->hasSSE2()) {
1867 // Do not use f64 to lower memcpy if the source is a string constant. It's
1868 // better to use i32 to avoid the loads.
1869 return MVT::f64;
1870 }
1871 }
1872 if (Subtarget->is64Bit() && Size >= 8)
1873 return MVT::i64;
1874 return MVT::i32;
1875}
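
getOptimalMemOpType() above picks the widest store type that the size, alignment, and available vector features allow when expanding memset/memcpy/memmove. A reduced standalone sketch of that decision, with hypothetical feature flags and a string result standing in for an EVT:

#include <cstdint>
#include <iostream>
#include <string>

struct Features { bool HasAVX2, HasAVX, HasSSE2, HasSSE1, Is64Bit; };

// Reduced model of the decision above: prefer 256-bit, then 128-bit vector
// stores when the copy is big enough and sufficiently aligned, otherwise fall
// back to the widest legal scalar type.
static std::string optimalMemOpType(std::uint64_t Size, unsigned Align,
                                    const Features &F) {
  if (Size >= 16 && (Align == 0 || Align >= 16)) {
    if (Size >= 32 && F.HasAVX2) return "v8i32";
    if (Size >= 32 && F.HasAVX)  return "v8f32";
    if (F.HasSSE2) return "v4i32";
    if (F.HasSSE1) return "v4f32";
  }
  if (F.Is64Bit && Size >= 8) return "i64";
  return "i32";
}

int main() {
  Features AVX2Machine{true, true, true, true, true};
  std::cout << optimalMemOpType(64, 32, AVX2Machine) << "\n"; // v8i32
  std::cout << optimalMemOpType(12, 4, AVX2Machine) << "\n";  // i64
}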
1876
1877bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878 if (VT == MVT::f32)
1879 return X86ScalarSSEf32;
1880 else if (VT == MVT::f64)
1881 return X86ScalarSSEf64;
1882 return true;
1883}
1884
1885bool
1886X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887 unsigned,
1888 unsigned,
1889 bool *Fast) const {
1890 if (Fast)
1891 *Fast = Subtarget->isUnalignedMemAccessFast();
1892 return true;
1893}
1894
1895/// Return the entry encoding for a jump table in the
1896/// current function. The returned value is a member of the
1897/// MachineJumpTableInfo::JTEntryKind enum.
1898unsigned X86TargetLowering::getJumpTableEncoding() const {
1899 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900 // symbol.
1901 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902 Subtarget->isPICStyleGOT())
1903 return MachineJumpTableInfo::EK_Custom32;
1904
1905 // Otherwise, use the normal jump table encoding heuristics.
1906 return TargetLowering::getJumpTableEncoding();
1907}
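
getJumpTableEncoding() above only deviates from the default when compiling PIC for a GOT-style target, where jump-table entries become @GOTOFF references. A tiny standalone sketch of that rule, using hypothetical enums in place of the MachineJumpTableInfo kinds:

// Hypothetical enum; the real code returns MachineJumpTableInfo::EK_Custom32
// for the GOT-style PIC case and defers to the default heuristics otherwise.
enum class JTEncoding { Default, Custom32 };

static JTEncoding jumpTableEncoding(bool IsPIC, bool IsPICStyleGOT) {
  return (IsPIC && IsPICStyleGOT) ? JTEncoding::Custom32 : JTEncoding::Default;
}

int main() {
  return jumpTableEncoding(true, true) == JTEncoding::Custom32 ? 0 : 1;
}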
1908
1909const MCExpr *
1910X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911 const MachineBasicBlock *MBB,
1912 unsigned uid,MCContext &Ctx) const{
1913 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914 Subtarget->isPICStyleGOT());
1915 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916 // entries.
1917 return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919}
1920
1921/// Returns relocation base for the given PIC jumptable.
1922SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923 SelectionDAG &DAG) const {
1924 if (!Subtarget->is64Bit())
1925 // This doesn't have an SDLoc associated with it, but it is not really the
1926 // same as a Register.
1927 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928 return Table;
1929}
1930
1931/// This returns the relocation base for the given PIC jumptable,
1932/// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933const MCExpr *X86TargetLowering::
1934getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935 MCContext &Ctx) const {
1936 // X86-64 uses RIP relative addressing based on the jump table label.
1937 if (Subtarget->isPICStyleRIPRel())
1938 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940 // Otherwise, the reference is relative to the PIC base.
1941 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942}
1943
1944// FIXME: Why this routine is here? Move to RegInfo!
1945std::pair<const TargetRegisterClass*, uint8_t>
1946X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947 const TargetRegisterClass *RRC = nullptr;
1948 uint8_t Cost = 1;
1949 switch (VT.SimpleTy) {
1950 default:
1951 return TargetLowering::findRepresentativeClass(VT);
1952 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954 break;
1955 case MVT::x86mmx:
1956 RRC = &X86::VR64RegClass;
1957 break;
1958 case MVT::f32: case MVT::f64:
1959 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960 case MVT::v4f32: case MVT::v2f64:
1961 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962 case MVT::v4f64:
1963 RRC = &X86::VR128RegClass;
1964 break;
1965 }
1966 return std::make_pair(RRC, Cost);
1967}
1968
1969bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970 unsigned &Offset) const {
1971 if (!Subtarget->isTargetLinux())
1972 return false;
1973
1974 if (Subtarget->is64Bit()) {
1975 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976 Offset = 0x28;
1977 if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978 AddressSpace = 256;
1979 else
1980 AddressSpace = 257;
1981 } else {
1982 // %gs:0x14 on i386
1983 Offset = 0x14;
1984 AddressSpace = 256;
1985 }
1986 return true;
1987}
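
getStackCookieLocation() above encodes the Linux convention: the cookie sits at %fs:0x28 on x86-64 (or %gs:0x28 under the kernel code model) and at %gs:0x14 on i386, expressed as an (address space, offset) pair where 256 means %gs and 257 means %fs. A standalone sketch of that mapping (simplified, hypothetical helper):

#include <cstdio>
#include <utility>

// Simplified stand-in: returns {address space, offset}. In X86 codegen,
// address space 256 corresponds to %gs and 257 to %fs.
static std::pair<unsigned, unsigned>
stackCookieLocation(bool Is64Bit, bool KernelCodeModel) {
  if (Is64Bit)
    return {KernelCodeModel ? 256u : 257u, 0x28u}; // %gs:0x28 or %fs:0x28
  return {256u, 0x14u};                            // %gs:0x14 on i386
}

int main() {
  auto Loc = stackCookieLocation(/*Is64Bit=*/true, /*KernelCodeModel=*/false);
  std::printf("addrspace %u, offset 0x%x\n", Loc.first, Loc.second); // 257, 0x28
}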
1988
1989bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990 unsigned DestAS) const {
1991 assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993 return SrcAS < 256 && DestAS < 256;
1994}
1995
1996//===----------------------------------------------------------------------===//
1997// Return Value Calling Convention Implementation
1998//===----------------------------------------------------------------------===//
1999
2000#include "X86GenCallingConv.inc"
2001
2002bool
2003X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004 MachineFunction &MF, bool isVarArg,
2005 const SmallVectorImpl<ISD::OutputArg> &Outs,
2006 LLVMContext &Context) const {
2007 SmallVector<CCValAssign, 16> RVLocs;
2008 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009 return CCInfo.CheckReturn(Outs, RetCC_X86);
2010}
2011
2012const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014 return ScratchRegs;
2015}
2016
2017SDValue
2018X86TargetLowering::LowerReturn(SDValue Chain,
2019 CallingConv::ID CallConv, bool isVarArg,
2020 const SmallVectorImpl<ISD::OutputArg> &Outs,
2021 const SmallVectorImpl<SDValue> &OutVals,
2022 SDLoc dl, SelectionDAG &DAG) const {
2023 MachineFunction &MF = DAG.getMachineFunction();
2024 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026 SmallVector<CCValAssign, 16> RVLocs;
2027 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030 SDValue Flag;
2031 SmallVector<SDValue, 6> RetOps;
2032 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033 // Operand #1 = Bytes To Pop
2034 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035 MVT::i16));
2036
2037 // Copy the result values into the output registers.
2038 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039 CCValAssign &VA = RVLocs[i];
2040 assert(VA.isRegLoc() && "Can only return in registers!");
2041 SDValue ValToCopy = OutVals[i];
2042 EVT ValVT = ValToCopy.getValueType();
2043
2044 // Promote values to the appropriate types.
2045 if (VA.getLocInfo() == CCValAssign::SExt)
2046 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047 else if (VA.getLocInfo() == CCValAssign::ZExt)
2048 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049 else if (VA.getLocInfo() == CCValAssign::AExt)
2050 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051 else if (VA.getLocInfo() == CCValAssign::BCvt)
2052 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055 "Unexpected FP-extend for return value.");
2056
2057 // If this is x86-64, and we disabled SSE, we can't return FP values,
2058 // or SSE or MMX vectors.
2059 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062 report_fatal_error("SSE register return with SSE disabled");
2063 }
2064 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2065 // llvm-gcc has never done it right and no one has noticed, so this
2066 // should be OK for now.
2067 if (ValVT == MVT::f64 &&
2068 (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069 report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072 // the RET instruction and handled by the FP Stackifier.
2073 if (VA.getLocReg() == X86::FP0 ||
2074 VA.getLocReg() == X86::FP1) {
2075 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076 // change the value to the FP stack register class.
2077 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079 RetOps.push_back(ValToCopy);
2080 // Don't emit a copytoreg.
2081 continue;
2082 }
2083
2084 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085 // which is returned in RAX / RDX.
2086 if (Subtarget->is64Bit()) {
2087 if (ValVT == MVT::x86mmx) {
2088 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091 ValToCopy);
2092 // If we don't have SSE2 available, convert to v4f32 so the generated
2093 // register is legal.
2094 if (!Subtarget->hasSSE2())
2095 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096 }
2097 }
2098 }
2099
2100 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101 Flag = Chain.getValue(1);
2102 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103 }
2104
2105 // The x86-64 ABIs require that for returning structs by value we copy
2106 // the sret argument into %rax/%eax (depending on ABI) for the return.
2107 // Win32 requires us to put the sret argument to %eax as well.
2108 // We saved the argument into a virtual register in the entry block,
2109 // so now we copy the value out and into %rax/%eax.
2110 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2111 (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2112 MachineFunction &MF = DAG.getMachineFunction();
2113 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2114 unsigned Reg = FuncInfo->getSRetReturnReg();
2115 assert(Reg &&
2116 "SRetReturnReg should have been set in LowerFormalArguments().");
2117 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2118
2119 unsigned RetValReg
2120 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2121 X86::RAX : X86::EAX;
2122 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2123 Flag = Chain.getValue(1);
2124
2125 // RAX/EAX now acts like a return value.
2126 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2127 }
2128
2129 RetOps[0] = Chain; // Update chain.
2130
2131 // Add the flag if we have it.
2132 if (Flag.getNode())
2133 RetOps.push_back(Flag);
2134
2135 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2136}
2137
2138bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2139 if (N->getNumValues() != 1)
2140 return false;
2141 if (!N->hasNUsesOfValue(1, 0))
2142 return false;
2143
2144 SDValue TCChain = Chain;
2145 SDNode *Copy = *N->use_begin();
2146 if (Copy->getOpcode() == ISD::CopyToReg) {
2147 // If the copy has a glue operand, we conservatively assume it isn't safe to
2148 // perform a tail call.
2149 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2150 return false;
2151 TCChain = Copy->getOperand(0);
2152 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2153 return false;
2154
2155 bool HasRet = false;
2156 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2157 UI != UE; ++UI) {
2158 if (UI->getOpcode() != X86ISD::RET_FLAG)
2159 return false;
2160 // If we are returning more than one value, we can definitely
2161 // not make a tail call; see PR19530.
2162 if (UI->getNumOperands() > 4)
2163 return false;
2164 if (UI->getNumOperands() == 4 &&
2165 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2166 return false;
2167 HasRet = true;
2168 }
2169
2170 if (!HasRet)
2171 return false;
2172
2173 Chain = TCChain;
2174 return true;
2175}
2176
2177EVT
2178X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2179 ISD::NodeType ExtendKind) const {
2180 MVT ReturnMVT;
2181 // TODO: Is this also valid on 32-bit?
2182 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2183 ReturnMVT = MVT::i8;
2184 else
2185 ReturnMVT = MVT::i32;
2186
2187 EVT MinVT = getRegisterType(Context, ReturnMVT);
2188 return VT.bitsLT(MinVT) ? MinVT : VT;
2189}
2190
2191/// Lower the result values of a call into the
2192/// appropriate copies out of appropriate physical registers.
2193///
2194SDValue
2195X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2196 CallingConv::ID CallConv, bool isVarArg,
2197 const SmallVectorImpl<ISD::InputArg> &Ins,
2198 SDLoc dl, SelectionDAG &DAG,
2199 SmallVectorImpl<SDValue> &InVals) const {
2200
2201 // Assign locations to each value returned by this call.
2202 SmallVector<CCValAssign, 16> RVLocs;
2203 bool Is64Bit = Subtarget->is64Bit();
2204 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2205 *DAG.getContext());
2206 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2207
2208 // Copy all of the result registers out of their specified physreg.
2209 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2210 CCValAssign &VA = RVLocs[i];
2211 EVT CopyVT = VA.getValVT();
2212
2213 // If this is x86-64, and we disabled SSE, we can't return FP values
2214 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2215 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2216 report_fatal_error("SSE register return with SSE disabled");
2217 }
2218
2219 // If we prefer to use the value in xmm registers, copy it out as f80 and
2220 // use a truncate to move it from fp stack reg to xmm reg.
2221 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2222 isScalarFPTypeInSSEReg(VA.getValVT()))
2223 CopyVT = MVT::f80;
2224
2225 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2226 CopyVT, InFlag).getValue(1);
2227 SDValue Val = Chain.getValue(0);
2228
2229 if (CopyVT != VA.getValVT())
2230 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2231 // This truncation won't change the value.
2232 DAG.getIntPtrConstant(1));
2233
2234 InFlag = Chain.getValue(2);
2235 InVals.push_back(Val);
2236 }
2237
2238 return Chain;
2239}
2240
2241//===----------------------------------------------------------------------===//
2242// C & StdCall & Fast Calling Convention implementation
2243//===----------------------------------------------------------------------===//
2244 // The StdCall calling convention is standard for many Windows API
2245 // routines. It differs from the C calling convention only slightly: the
2246 // callee, not the caller, cleans up the stack, and symbols are decorated
2247 // in a particular way. It doesn't support any vector arguments.
2248// For info on fast calling convention see Fast Calling Convention (tail call)
2249// implementation LowerX86_32FastCCCallTo.
2250
2251/// Determines whether a call uses struct return
2252/// semantics.
2253enum StructReturnType {
2254 NotStructReturn,
2255 RegStructReturn,
2256 StackStructReturn
2257};
2258static StructReturnType
2259callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2260 if (Outs.empty())
2261 return NotStructReturn;
2262
2263 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2264 if (!Flags.isSRet())
2265 return NotStructReturn;
2266 if (Flags.isInReg())
2267 return RegStructReturn;
2268 return StackStructReturn;
2269}
2270
2271/// Determines whether a function uses struct return semantics.
2272static StructReturnType
2273argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2274 if (Ins.empty())
2275 return NotStructReturn;
2276
2277 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2278 if (!Flags.isSRet())
2279 return NotStructReturn;
2280 if (Flags.isInReg())
2281 return RegStructReturn;
2282 return StackStructReturn;
2283}
2284
2285/// Make a copy of an aggregate at the address specified by "Src" to the
2286/// address "Dst", with size and alignment information specified by the
2287/// corresponding parameter attribute. The copy will be passed as a byval function parameter.
2288static SDValue
2289CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2290 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2291 SDLoc dl) {
2292 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2293
2294 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2295 /*isVolatile*/false, /*AlwaysInline=*/true,
2296 MachinePointerInfo(), MachinePointerInfo());
2297}
2298
2299/// Return true if the calling convention is one that
2300/// supports tail call optimization.
2301static bool IsTailCallConvention(CallingConv::ID CC) {
2302 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2303 CC == CallingConv::HiPE);
2304}
2305
2306/// \brief Return true if the calling convention is a C calling convention.
2307static bool IsCCallConvention(CallingConv::ID CC) {
2308 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2309 CC == CallingConv::X86_64_SysV);
2310}
2311
2312bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2313 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2314 return false;
2315
2316 CallSite CS(CI);
2317 CallingConv::ID CalleeCC = CS.getCallingConv();
2318 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2319 return false;
2320
2321 return true;
2322}
2323
2324/// Return true if the function is being made into
2325/// a tailcall target by changing its ABI.
2326static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2327 bool GuaranteedTailCallOpt) {
2328 return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2329}
2330
2331SDValue
2332X86TargetLowering::LowerMemArgument(SDValue Chain,
2333 CallingConv::ID CallConv,
2334 const SmallVectorImpl<ISD::InputArg> &Ins,
2335 SDLoc dl, SelectionDAG &DAG,
2336 const CCValAssign &VA,
2337 MachineFrameInfo *MFI,
2338 unsigned i) const {
2339 // Create the nodes corresponding to a load from this parameter slot.
2340 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2341 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2342 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2343 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2344 EVT ValVT;
2345
2346 // If value is passed by pointer we have address passed instead of the value
2347 // itself.
2348 if (VA.getLocInfo() == CCValAssign::Indirect)
2349 ValVT = VA.getLocVT();
2350 else
2351 ValVT = VA.getValVT();
2352
2353 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2354 // changed with more analysis.
2355 // In the case of tail call optimization, mark all arguments mutable, since
2356 // they could be overwritten by the lowering of arguments for a tail call.
2357 if (Flags.isByVal()) {
2358 unsigned Bytes = Flags.getByValSize();
2359 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2360 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2361 return DAG.getFrameIndex(FI, getPointerTy());
2362 } else {
2363 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2364 VA.getLocMemOffset(), isImmutable);
2365 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2366 return DAG.getLoad(ValVT, dl, Chain, FIN,
2367 MachinePointerInfo::getFixedStack(FI),
2368 false, false, false, 0);
2369 }
2370}
2371
2372// FIXME: Get this from tablegen.
2373static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2374 const X86Subtarget *Subtarget) {
2375 assert(Subtarget->is64Bit());
2376
2377 if (Subtarget->isCallingConvWin64(CallConv)) {
2378 static const MCPhysReg GPR64ArgRegsWin64[] = {
2379 X86::RCX, X86::RDX, X86::R8, X86::R9
2380 };
2381 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2382 }
2383
2384 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2385 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2386 };
2387 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2388}
2389
2390// FIXME: Get this from tablegen.
2391static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2392 CallingConv::ID CallConv,
2393 const X86Subtarget *Subtarget) {
2394 assert(Subtarget->is64Bit());
2395 if (Subtarget->isCallingConvWin64(CallConv)) {
2396 // The XMM registers which might contain var arg parameters are shadowed
2397 // in their paired GPRs, so we only need to save the GPRs to their home
2398 // slots.
2399 // TODO: __vectorcall will change this.
2400 return None;
2401 }
2402
2403 const Function *Fn = MF.getFunction();
2404 bool NoImplicitFloatOps = Fn->getAttributes().
2405 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2406 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2407 "SSE register cannot be used when SSE is disabled!");
2408 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2409 !Subtarget->hasSSE1())
2410 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2411 // registers.
2412 return None;
2413
2414 static const MCPhysReg XMMArgRegs64Bit[] = {
2415 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2416 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2417 };
2418 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2419}
2420
2421SDValue
2422X86TargetLowering::LowerFormalArguments(SDValue Chain,
2423 CallingConv::ID CallConv,
2424 bool isVarArg,
2425 const SmallVectorImpl<ISD::InputArg> &Ins,
2426 SDLoc dl,
2427 SelectionDAG &DAG,
2428 SmallVectorImpl<SDValue> &InVals)
2429 const {
2430 MachineFunction &MF = DAG.getMachineFunction();
2431 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2432
2433 const Function* Fn = MF.getFunction();
2434 if (Fn->hasExternalLinkage() &&
2435 Subtarget->isTargetCygMing() &&
2436 Fn->getName() == "main")
2437 FuncInfo->setForceFramePointer(true);
2438
2439 MachineFrameInfo *MFI = MF.getFrameInfo();
2440 bool Is64Bit = Subtarget->is64Bit();
2441 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2442
2443 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2444 "Var args not supported with calling convention fastcc, ghc or hipe");
2445
2446 // Assign locations to all of the incoming arguments.
2447 SmallVector<CCValAssign, 16> ArgLocs;
2448 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2449
2450 // Allocate shadow area for Win64
2451 if (IsWin64)
2452 CCInfo.AllocateStack(32, 8);
2453
2454 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2455
2456 unsigned LastVal = ~0U;
2457 SDValue ArgValue;
2458 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2459 CCValAssign &VA = ArgLocs[i];
2460 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2461 // places.
2462 assert(VA.getValNo() != LastVal &&
2463 "Don't support value assigned to multiple locs yet");
2464 (void)LastVal;
2465 LastVal = VA.getValNo();
2466
2467 if (VA.isRegLoc()) {
2468 EVT RegVT = VA.getLocVT();
2469 const TargetRegisterClass *RC;
2470 if (RegVT == MVT::i32)
2471 RC = &X86::GR32RegClass;
2472 else if (Is64Bit && RegVT == MVT::i64)
2473 RC = &X86::GR64RegClass;
2474 else if (RegVT == MVT::f32)
2475 RC = &X86::FR32RegClass;
2476 else if (RegVT == MVT::f64)
2477 RC = &X86::FR64RegClass;
2478 else if (RegVT.is512BitVector())
2479 RC = &X86::VR512RegClass;
2480 else if (RegVT.is256BitVector())
2481 RC = &X86::VR256RegClass;
2482 else if (RegVT.is128BitVector())
2483 RC = &X86::VR128RegClass;
2484 else if (RegVT == MVT::x86mmx)
2485 RC = &X86::VR64RegClass;
2486 else if (RegVT == MVT::i1)
2487 RC = &X86::VK1RegClass;
2488 else if (RegVT == MVT::v8i1)
2489 RC = &X86::VK8RegClass;
2490 else if (RegVT == MVT::v16i1)
2491 RC = &X86::VK16RegClass;
2492 else if (RegVT == MVT::v32i1)
2493 RC = &X86::VK32RegClass;
2494 else if (RegVT == MVT::v64i1)
2495 RC = &X86::VK64RegClass;
2496 else
2497 llvm_unreachable("Unknown argument type!")::llvm::llvm_unreachable_internal("Unknown argument type!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 2497)
;
2498
2499 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2501
2502 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2503 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2504 // right size.
2505 if (VA.getLocInfo() == CCValAssign::SExt)
2506 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2507 DAG.getValueType(VA.getValVT()));
2508 else if (VA.getLocInfo() == CCValAssign::ZExt)
2509 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2510 DAG.getValueType(VA.getValVT()));
2511 else if (VA.getLocInfo() == CCValAssign::BCvt)
2512 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2513
2514 if (VA.isExtInLoc()) {
2515 // Handle MMX values passed in XMM regs.
2516 if (RegVT.isVector())
2517 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2518 else
2519 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2520 }
2521 } else {
2522 assert(VA.isMemLoc());
2523 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2524 }
2525
2526 // If value is passed via pointer - do a load.
2527 if (VA.getLocInfo() == CCValAssign::Indirect)
2528 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2529 MachinePointerInfo(), false, false, false, 0);
2530
2531 InVals.push_back(ArgValue);
2532 }
2533
2534 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2535 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2536 // The x86-64 ABIs require that for returning structs by value we copy
2537 // the sret argument into %rax/%eax (depending on ABI) for the return.
2538 // Win32 requires us to put the sret argument to %eax as well.
2539 // Save the argument into a virtual register so that we can access it
2540 // from the return points.
2541 if (Ins[i].Flags.isSRet()) {
2542 unsigned Reg = FuncInfo->getSRetReturnReg();
2543 if (!Reg) {
2544 MVT PtrTy = getPointerTy();
2545 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2546 FuncInfo->setSRetReturnReg(Reg);
2547 }
2548 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2549 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2550 break;
2551 }
2552 }
2553 }
2554
2555 unsigned StackSize = CCInfo.getNextStackOffset();
2556 // Align stack specially for tail calls.
2557 if (FuncIsMadeTailCallSafe(CallConv,
2558 MF.getTarget().Options.GuaranteedTailCallOpt))
2559 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2560
2561 // If the function takes variable number of arguments, make a frame index for
2562 // the start of the first vararg value... for expansion of llvm.va_start. We
2563 // can skip this if there are no va_start calls.
2564 if (MFI->hasVAStart() &&
2565 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2566 CallConv != CallingConv::X86_ThisCall))) {
2567 FuncInfo->setVarArgsFrameIndex(
2568 MFI->CreateFixedObject(1, StackSize, true));
2569 }
2570
2571 // Figure out if XMM registers are in use.
2572 assert(!(MF.getTarget().Options.UseSoftFloat &&
2573 Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2574 Attribute::NoImplicitFloat)) &&
2575 "SSE register cannot be used when SSE is disabled!");
2576
2577 // 64-bit calling conventions support varargs and register parameters, so we
2578 // have to do extra work to spill them in the prologue.
2579 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2580 // Find the first unallocated argument registers.
2581 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2582 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2583 unsigned NumIntRegs =
2584 CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2585 unsigned NumXMMRegs =
2586 CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2587 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2588 "SSE register cannot be used when SSE is disabled!");
2589
2590 // Gather all the live in physical registers.
2591 SmallVector<SDValue, 6> LiveGPRs;
2592 SmallVector<SDValue, 8> LiveXMMRegs;
2593 SDValue ALVal;
2594 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2595 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2596 LiveGPRs.push_back(
2597 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2598 }
2599 if (!ArgXMMs.empty()) {
2600 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2601 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2602 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2603 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2604 LiveXMMRegs.push_back(
2605 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2606 }
2607 }
2608
2609 if (IsWin64) {
2610 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2611 // Get to the caller-allocated home save location. Add 8 to account
2612 // for the return address.
2613 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2614 FuncInfo->setRegSaveFrameIndex(
2615 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2616 // Fixup to set vararg frame on shadow area (4 x i64).
2617 if (NumIntRegs < 4)
2618 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2619 } else {
2620 // For X86-64, if there are vararg parameters that are passed via
2621 // registers, then we must store them to their spots on the stack so
2622 // they may be loaded by dereferencing the result of va_next.
2623 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2624 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2625 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2626 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2627 }
2628
2629 // Store the integer parameter registers.
2630 SmallVector<SDValue, 8> MemOps;
2631 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2632 getPointerTy());
2633 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2634 for (SDValue Val : LiveGPRs) {
2635 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2636 DAG.getIntPtrConstant(Offset));
2637 SDValue Store =
2638 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2639 MachinePointerInfo::getFixedStack(
2640 FuncInfo->getRegSaveFrameIndex(), Offset),
2641 false, false, 0);
2642 MemOps.push_back(Store);
2643 Offset += 8;
2644 }
2645
2646 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2647 // Now store the XMM (fp + vector) parameter registers.
2648 SmallVector<SDValue, 12> SaveXMMOps;
2649 SaveXMMOps.push_back(Chain);
2650 SaveXMMOps.push_back(ALVal);
2651 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2652 FuncInfo->getRegSaveFrameIndex()));
2653 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2654 FuncInfo->getVarArgsFPOffset()));
2655 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2656 LiveXMMRegs.end());
2657 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2658 MVT::Other, SaveXMMOps));
2659 }
2660
2661 if (!MemOps.empty())
2662 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2663 }
2664
2665 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2666 // Find the largest legal vector type.
2667 MVT VecVT = MVT::Other;
2668 // FIXME: Only some x86_32 calling conventions support AVX512.
2669 if (Subtarget->hasAVX512() &&
2670 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2671 CallConv == CallingConv::Intel_OCL_BI)))
2672 VecVT = MVT::v16f32;
2673 else if (Subtarget->hasAVX())
2674 VecVT = MVT::v8f32;
2675 else if (Subtarget->hasSSE2())
2676 VecVT = MVT::v4f32;
2677
2678 // We forward some GPRs and some vector types.
2679 SmallVector<MVT, 2> RegParmTypes;
2680 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2681 RegParmTypes.push_back(IntVT);
2682 if (VecVT != MVT::Other)
2683 RegParmTypes.push_back(VecVT);
2684
2685 // Compute the set of forwarded registers. The rest are scratch.
2686 SmallVectorImpl<ForwardedRegister> &Forwards =
2687 FuncInfo->getForwardedMustTailRegParms();
2688 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2689
2690 // Conservatively forward AL on x86_64, since it might be used for varargs.
2691 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2692 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2693 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2694 }
2695
2696 // Copy all forwards from physical to virtual registers.
2697 for (ForwardedRegister &F : Forwards) {
2698 // FIXME: Can we use a less constrained schedule?
2699 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2700 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2701 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2702 }
2703 }
2704
2705 // Some CCs need callee pop.
2706 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2707 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2708 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2709 } else {
2710 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2711 // If this is an sret function, the return should pop the hidden pointer.
2712 if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2713 !Subtarget->getTargetTriple().isOSMSVCRT() &&
2714 argsAreStructReturn(Ins) == StackStructReturn)
2715 FuncInfo->setBytesToPopOnReturn(4);
2716 }
2717
2718 if (!Is64Bit) {
2719 // RegSaveFrameIndex is X86-64 only.
2720 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2721 if (CallConv == CallingConv::X86_FastCall ||
2722 CallConv == CallingConv::X86_ThisCall)
2723 // fastcc functions can't have varargs.
2724 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2725 }
2726
2727 FuncInfo->setArgumentStackSize(StackSize);
2728
2729 return Chain;
2730}
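
As an aside for readers of the vararg spilling above: the frame objects and offsets it records (VarArgsGPOffset, VarArgsFPOffset, the register save area) correspond to the System V x86-64 va_list layout. A minimal illustrative struct, assuming the SysV ABI; it is not part of this file:

struct SysVVaList {          // mirrors one va_list element on x86-64 SysV (illustrative)
  unsigned gp_offset;        // byte offset of the next GPR slot in reg_save_area (0..48)
  unsigned fp_offset;        // byte offset of the next XMM slot (48..176)
  void *overflow_arg_area;   // stack arguments beyond the register save area
  void *reg_save_area;       // 6 * 8 bytes of GPRs followed by 8 * 16 bytes of XMMs
};

// Example: for a function whose only named argument occupies RDI, the code
// above records gp_offset = 1 * 8 = 8, fp_offset = 6 * 8 + 0 * 16 = 48, and
// creates a 6 * 8 + 8 * 16 = 176-byte register save area.
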
2731
2732SDValue
2733X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2734 SDValue StackPtr, SDValue Arg,
2735 SDLoc dl, SelectionDAG &DAG,
2736 const CCValAssign &VA,
2737 ISD::ArgFlagsTy Flags) const {
2738 unsigned LocMemOffset = VA.getLocMemOffset();
2739 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2740 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2741 if (Flags.isByVal())
2742 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2743
2744 return DAG.getStore(Chain, dl, Arg, PtrOff,
2745 MachinePointerInfo::getStack(LocMemOffset),
2746 false, false, 0);
2747}
2748
2749/// Emit a load of return address if tail call
2750/// optimization is performed and it is required.
2751SDValue
2752X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2753 SDValue &OutRetAddr, SDValue Chain,
2754 bool IsTailCall, bool Is64Bit,
2755 int FPDiff, SDLoc dl) const {
2756 // Adjust the Return address stack slot.
2757 EVT VT = getPointerTy();
2758 OutRetAddr = getReturnAddressFrameIndex(DAG);
2759
2760 // Load the "old" Return address.
2761 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2762 false, false, false, 0);
2763 return SDValue(OutRetAddr.getNode(), 1);
2764}
2765
2766/// Emit a store of the return address if tail call
2767/// optimization is performed and it is required (FPDiff!=0).
2768static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2769 SDValue Chain, SDValue RetAddrFrIdx,
2770 EVT PtrVT, unsigned SlotSize,
2771 int FPDiff, SDLoc dl) {
2772 // Store the return address to the appropriate stack slot.
2773 if (!FPDiff) return Chain;
2774 // Calculate the new stack slot for the return address.
2775 int NewReturnAddrFI =
2776 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2777 false);
2778 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2779 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2780 MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2781 false, false, 0);
2782 return Chain;
2783}
2784
2785SDValue
2786X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2787 SmallVectorImpl<SDValue> &InVals) const {
2788 SelectionDAG &DAG = CLI.DAG;
2789 SDLoc &dl = CLI.DL;
2790 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2791 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2792 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2793 SDValue Chain = CLI.Chain;
2794 SDValue Callee = CLI.Callee;
2795 CallingConv::ID CallConv = CLI.CallConv;
2796 bool &isTailCall = CLI.IsTailCall;
2797 bool isVarArg = CLI.IsVarArg;
2798
2799 MachineFunction &MF = DAG.getMachineFunction();
2800 bool Is64Bit = Subtarget->is64Bit();
2801 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2802 StructReturnType SR = callIsStructReturn(Outs);
2803 bool IsSibcall = false;
2804 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2805
2806 if (MF.getTarget().Options.DisableTailCalls)
2807 isTailCall = false;
2808
2809 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2810 if (IsMustTail) {
2811 // Force this to be a tail call. The verifier rules are enough to ensure
2812 // that we can lower this successfully without moving the return address
2813 // around.
2814 isTailCall = true;
2815 } else if (isTailCall) {
2816 // Check if it's really possible to do a tail call.
2817 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2818 isVarArg, SR != NotStructReturn,
2819 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2820 Outs, OutVals, Ins, DAG);
2821
2822 // Sibcalls are automatically detected tailcalls which do not require
2823 // ABI changes.
2824 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2825 IsSibcall = true;
2826
2827 if (isTailCall)
2828 ++NumTailCalls;
2829 }
2830
2831 assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2832 "Var args not supported with calling convention fastcc, ghc or hipe");
2833
2834 // Analyze operands of the call, assigning locations to each operand.
2835 SmallVector<CCValAssign, 16> ArgLocs;
2836 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2837
2838 // Allocate shadow area for Win64
2839 if (IsWin64)
2840 CCInfo.AllocateStack(32, 8);
2841
2842 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2843
2844 // Get a count of how many bytes are to be pushed on the stack.
2845 unsigned NumBytes = CCInfo.getNextStackOffset();
2846 if (IsSibcall)
2847 // This is a sibcall. The memory operands are available in caller's
2848 // own caller's stack.
2849 NumBytes = 0;
2850 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2851 IsTailCallConvention(CallConv))
2852 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2853
2854 int FPDiff = 0;
2855 if (isTailCall && !IsSibcall && !IsMustTail) {
2856 // Lower arguments at fp - stackoffset + fpdiff.
2857 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2858
2859 FPDiff = NumBytesCallerPushed - NumBytes;
2860
2861 // Set the delta of movement of the returnaddr stackslot.
2862 // But only set if delta is greater than previous delta.
2863 if (FPDiff < X86Info->getTCReturnAddrDelta())
2864 X86Info->setTCReturnAddrDelta(FPDiff);
2865 }
2866
2867 unsigned NumBytesToPush = NumBytes;
2868 unsigned NumBytesToPop = NumBytes;
2869
2870 // If we have an inalloca argument, all stack space has already been allocated
2871 // for us and is right at the top of the stack. We don't support multiple
2872 // arguments passed in memory when using inalloca.
2873 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2874 NumBytesToPush = 0;
2875 if (!ArgLocs.back().isMemLoc())
2876 report_fatal_error("cannot use inalloca attribute on a register "
2877 "parameter");
2878 if (ArgLocs.back().getLocMemOffset() != 0)
2879 report_fatal_error("any parameter with the inalloca attribute must be "
2880 "the only memory argument");
2881 }
2882
2883 if (!IsSibcall)
2884 Chain = DAG.getCALLSEQ_START(
2885 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2886
2887 SDValue RetAddrFrIdx;
2888 // Load return address for tail calls.
2889 if (isTailCall && FPDiff)
2890 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2891 Is64Bit, FPDiff, dl);
2892
2893 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2894 SmallVector<SDValue, 8> MemOpChains;
2895 SDValue StackPtr;
2896
2897 // Walk the register/memloc assignments, inserting copies/loads. In the case
2898 // of tail call optimization, arguments are handled later.
2899 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2900 DAG.getSubtarget().getRegisterInfo());
2901 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902 // Skip inalloca arguments, they have already been written.
2903 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904 if (Flags.isInAlloca())
2905 continue;
2906
2907 CCValAssign &VA = ArgLocs[i];
2908 EVT RegVT = VA.getLocVT();
2909 SDValue Arg = OutVals[i];
2910 bool isByVal = Flags.isByVal();
2911
2912 // Promote the value if needed.
2913 switch (VA.getLocInfo()) {
2914 default: llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 2914)
;
2915 case CCValAssign::Full: break;
2916 case CCValAssign::SExt:
2917 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918 break;
2919 case CCValAssign::ZExt:
2920 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921 break;
2922 case CCValAssign::AExt:
2923 if (RegVT.is128BitVector()) {
2924 // Special case: passing MMX values in XMM registers.
2925 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928 } else
2929 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930 break;
2931 case CCValAssign::BCvt:
2932 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933 break;
2934 case CCValAssign::Indirect: {
2935 // Store the argument.
2936 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939 MachinePointerInfo::getFixedStack(FI),
2940 false, false, 0);
2941 Arg = SpillSlot;
2942 break;
2943 }
2944 }
2945
2946 if (VA.isRegLoc()) {
2947 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948 if (isVarArg && IsWin64) {
2949 // The Win64 ABI requires an argument XMM reg to be copied to the
2950 // corresponding shadow reg if the callee is a varargs function.
2951 unsigned ShadowReg = 0;
2952 switch (VA.getLocReg()) {
2953 case X86::XMM0: ShadowReg = X86::RCX; break;
2954 case X86::XMM1: ShadowReg = X86::RDX; break;
2955 case X86::XMM2: ShadowReg = X86::R8; break;
2956 case X86::XMM3: ShadowReg = X86::R9; break;
2957 }
2958 if (ShadowReg)
2959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960 }
2961 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2962 assert(VA.isMemLoc());
2963 if (!StackPtr.getNode())
2964 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965 getPointerTy());
2966 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967 dl, DAG, VA, Flags));
2968 }
2969 }
2970
2971 if (!MemOpChains.empty())
2972 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974 if (Subtarget->isPICStyleGOT()) {
2975 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976 // GOT pointer.
2977 if (!isTailCall) {
2978 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980 } else {
2981 // If we are tail calling and generating PIC/GOT style code load the
2982 // address of the callee into ECX. The value in ecx is used as target of
2983 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984 // for tail calls on PIC/GOT architectures. Normally we would just put the
2985 // address of GOT into ebx and then call target@PLT. But for tail calls
2986 // ebx would be restored (since ebx is callee saved) before jumping to the
2987 // target@PLT.
2988
2989 // Note: The actual moving to ECX is done further down.
2990 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991 if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992 !G->getGlobal()->hasProtectedVisibility())
2993 Callee = LowerGlobalAddress(Callee, DAG);
2994 else if (isa<ExternalSymbolSDNode>(Callee))
2995 Callee = LowerExternalSymbol(Callee, DAG);
2996 }
2997 }
2998
2999 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000 // From AMD64 ABI document:
3001 // For calls that may call functions that use varargs or stdargs
3002 // (prototype-less calls or calls to functions containing ellipsis (...) in
3003 // the declaration) %al is used as hidden argument to specify the number
3004 // of SSE registers used. The contents of %al do not need to match exactly
3005 // the number of registers, but must be an upper bound on the number of SSE
3006 // registers used and is in the range 0 - 8 inclusive.
3007
3008 // Count the number of XMM registers allocated.
3009 static const MCPhysReg XMMArgRegs[] = {
3010 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012 };
3013 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014 assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015 && "SSE registers cannot be used when SSE is disabled");
3016
3017 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018 DAG.getConstant(NumXMMRegs, MVT::i8)));
3019 }
3020
3021 if (isVarArg && IsMustTail) {
3022 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023 for (const auto &F : Forwards) {
3024 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026 }
3027 }
3028
3029 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3030 // don't need this because the eligibility check rejects calls that require
3031 // shuffling arguments passed in memory.
3032 if (!IsSibcall && isTailCall) {
3033 // Force all the incoming stack arguments to be loaded from the stack
3034 // before any new outgoing arguments are stored to the stack, because the
3035 // outgoing stack slots may alias the incoming argument stack slots, and
3036 // the alias isn't otherwise explicit. This is slightly more conservative
3037 // than necessary, because it means that each store effectively depends
3038 // on every argument instead of just those arguments it would clobber.
3039 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041 SmallVector<SDValue, 8> MemOpChains2;
3042 SDValue FIN;
3043 int FI = 0;
3044 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045 CCValAssign &VA = ArgLocs[i];
3046 if (VA.isRegLoc())
3047 continue;
3048 assert(VA.isMemLoc());
3049 SDValue Arg = OutVals[i];
3050 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051 // Skip inalloca arguments. They don't require any work.
3052 if (Flags.isInAlloca())
3053 continue;
3054 // Create frame index.
3055 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058 FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060 if (Flags.isByVal()) {
3061 // Copy relative to framepointer.
3062 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063 if (!StackPtr.getNode())
3064 StackPtr = DAG.getCopyFromReg(Chain, dl,
3065 RegInfo->getStackRegister(),
3066 getPointerTy());
3067 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070 ArgChain,
3071 Flags, DAG, dl));
3072 } else {
3073 // Store relative to framepointer.
3074 MemOpChains2.push_back(
3075 DAG.getStore(ArgChain, dl, Arg, FIN,
3076 MachinePointerInfo::getFixedStack(FI),
3077 false, false, 0));
3078 }
3079 }
3080
3081 if (!MemOpChains2.empty())
3082 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084 // Store the return address to the appropriate stack slot.
3085 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086 getPointerTy(), RegInfo->getSlotSize(),
3087 FPDiff, dl);
3088 }
3089
3090 // Build a sequence of copy-to-reg nodes chained together with token chain
3091 // and flag operands which copy the outgoing args into registers.
3092 SDValue InFlag;
3093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095 RegsToPass[i].second, InFlag);
3096 InFlag = Chain.getValue(1);
3097 }
3098
3099 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101 // In the 64-bit large code model, we have to make all calls
3102 // through a register, since the call instruction's 32-bit
3103 // pc-relative offset may not be large enough to hold the whole
3104 // address.
3105 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106 // If the callee is a GlobalAddress node (quite common, every direct call
3107 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108 // it.
3109 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111 // We should use extra load for direct calls to dllimported functions in
3112 // non-JIT mode.
3113 const GlobalValue *GV = G->getGlobal();
3114 if (!GV->hasDLLImportStorageClass()) {
3115 unsigned char OpFlags = 0;
3116 bool ExtraLoad = false;
3117 unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120 // external symbols must go through the PLT in PIC mode. If the symbol
3121 // has hidden or protected visibility, or if it is static or local, then
3122 // we don't need to use the PLT - we can directly call it.
3123 if (Subtarget->isTargetELF() &&
3124 DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126 OpFlags = X86II::MO_PLT;
3127 } else if (Subtarget->isPICStyleStubAny() &&
3128 (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129 (!Subtarget->getTargetTriple().isMacOSX() ||
3130 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131 // PC-relative references to external symbols should go through $stub,
3132 // unless we're building with the leopard linker or later, which
3133 // automatically synthesizes these stubs.
3134 OpFlags = X86II::MO_DARWIN_STUB;
3135 } else if (Subtarget->isPICStyleRIPRel() &&
3136 isa<Function>(GV) &&
3137 cast<Function>(GV)->getAttributes().
3138 hasAttribute(AttributeSet::FunctionIndex,
3139 Attribute::NonLazyBind)) {
3140 // If the function is marked as non-lazy, generate an indirect call
3141 // which loads from the GOT directly. This avoids runtime overhead
3142 // at the cost of eager binding (and one extra byte of encoding).
3143 OpFlags = X86II::MO_GOTPCREL;
3144 WrapperKind = X86ISD::WrapperRIP;
3145 ExtraLoad = true;
3146 }
3147
3148 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149 G->getOffset(), OpFlags);
3150
3151 // Add a wrapper if needed.
3152 if (WrapperKind != ISD::DELETED_NODE)
3153 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154 // Add extra indirection if needed.
3155 if (ExtraLoad)
3156 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157 MachinePointerInfo::getGOT(),
3158 false, false, false, 0);
3159 }
3160 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161 unsigned char OpFlags = 0;
3162
3163 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164 // external symbols should go through the PLT.
3165 if (Subtarget->isTargetELF() &&
3166 DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167 OpFlags = X86II::MO_PLT;
3168 } else if (Subtarget->isPICStyleStubAny() &&
3169 (!Subtarget->getTargetTriple().isMacOSX() ||
3170 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171 // PC-relative references to external symbols should go through $stub,
3172 // unless we're building with the leopard linker or later, which
3173 // automatically synthesizes these stubs.
3174 OpFlags = X86II::MO_DARWIN_STUB;
3175 }
3176
3177 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178 OpFlags);
3179 } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
3180 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3181 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3182 }
3183
3184 // Returns a chain & a flag for retval copy to use.
3185 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3186 SmallVector<SDValue, 8> Ops;
3187
3188 if (!IsSibcall && isTailCall) {
3189 Chain = DAG.getCALLSEQ_END(Chain,
3190 DAG.getIntPtrConstant(NumBytesToPop, true),
3191 DAG.getIntPtrConstant(0, true), InFlag, dl);
3192 InFlag = Chain.getValue(1);
3193 }
3194
3195 Ops.push_back(Chain);
3196 Ops.push_back(Callee);
3197
3198 if (isTailCall)
3199 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3200
3201 // Add argument registers to the end of the list so that they are known live
3202 // into the call.
3203 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3204 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3205 RegsToPass[i].second.getValueType()));
3206
3207 // Add a register mask operand representing the call-preserved registers.
3208 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
3209 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3210 assert(Mask && "Missing call preserved mask for calling convention");
3211 Ops.push_back(DAG.getRegisterMask(Mask));
3212
3213 if (InFlag.getNode())
3214 Ops.push_back(InFlag);
3215
3216 if (isTailCall) {
3217 // We used to do:
3218 //// If this is the first return lowered for this function, add the regs
3219 //// to the liveout set for the function.
3220 // This isn't right, although it's probably harmless on x86; liveouts
3221 // should be computed from returns not tail calls. Consider a void
3222 // function making a tail call to a function returning int.
3223 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3224 }
3225
3226 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3227 InFlag = Chain.getValue(1);
3228
3229 // Create the CALLSEQ_END node.
3230 unsigned NumBytesForCalleeToPop;
3231 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3232 DAG.getTarget().Options.GuaranteedTailCallOpt))
3233 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3234 else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3235 !Subtarget->getTargetTriple().isOSMSVCRT() &&
3236 SR == StackStructReturn)
3237 // If this is a call to a struct-return function, the callee
3238 // pops the hidden struct pointer, so we have to push it back.
3239 // This is common for Darwin/X86, Linux & Mingw32 targets.
3240 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3241 NumBytesForCalleeToPop = 4;
3242 else
3243 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3244
3245 // Returns a flag for retval copy to use.
3246 if (!IsSibcall) {
3247 Chain = DAG.getCALLSEQ_END(Chain,
3248 DAG.getIntPtrConstant(NumBytesToPop, true),
3249 DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3250 true),
3251 InFlag, dl);
3252 InFlag = Chain.getValue(1);
3253 }
3254
3255 // Handle result values, copying them out of physregs into vregs that we
3256 // return.
3257 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3258 Ins, dl, DAG, InVals);
3259}
3260
3261//===----------------------------------------------------------------------===//
3262// Fast Calling Convention (tail call) implementation
3263//===----------------------------------------------------------------------===//
3264
3265 // Like stdcall, the callee cleans up the arguments, except that ECX is
3266// reserved for storing the tail called function address. Only 2 registers are
3267// free for argument passing (inreg). Tail call optimization is performed
3268// provided:
3269// * tailcallopt is enabled
3270// * caller/callee are fastcc
3271// On X86_64 architecture with GOT-style position independent code only local
3272// (within module) calls are supported at the moment.
3273 // To keep the stack aligned according to the platform ABI, the function
3274 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3275 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.)
3276 // If a tail-called callee has more arguments than the caller, the
3277// caller needs to make sure that there is room to move the RETADDR to. This is
3278// achieved by reserving an area the size of the argument delta right after the
3279// original RETADDR, but before the saved framepointer or the spilled registers
3280// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3281// stack layout:
3282// arg1
3283// arg2
3284// RETADDR
3285// [ new RETADDR
3286// move area ]
3287// (possible EBP)
3288// ESI
3289// EDI
3290// local1 ..
3291
3292 /// GetAlignedArgumentStackSize - Round the stack size up so that it is, e.g.,
3293 /// of the form 16n + 12 for a 16-byte alignment requirement.
3294unsigned
3295X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3296 SelectionDAG& DAG) const {
3297 MachineFunction &MF = DAG.getMachineFunction();
3298 const TargetMachine &TM = MF.getTarget();
3299 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3300 TM.getSubtargetImpl()->getRegisterInfo());
3301 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3302 unsigned StackAlignment = TFI.getStackAlignment();
3303 uint64_t AlignMask = StackAlignment - 1;
3304 int64_t Offset = StackSize;
3305 unsigned SlotSize = RegInfo->getSlotSize();
3306 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3307 // Number smaller than 12 so just add the difference.
3308 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3309 } else {
3310 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3311 Offset = ((~AlignMask) & Offset) + StackAlignment +
3312 (StackAlignment-SlotSize);
3313 }
3314 return Offset;
3315}
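
A self-contained sketch of the rounding GetAlignedArgumentStackSize performs, with a few worked values; the helper name and the test inputs are illustrative only, not taken from this file:

#include <cassert>
#include <cstdint>

// Same arithmetic as above, restated for illustration.
static uint64_t alignArgStackSize(uint64_t StackSize, uint64_t StackAlignment,
                                  uint64_t SlotSize) {
  uint64_t AlignMask = StackAlignment - 1;
  uint64_t Offset = StackSize;
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
    Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
  else
    Offset = (Offset & ~AlignMask) + StackAlignment + (StackAlignment - SlotSize);
  return Offset;
}

int main() {
  assert(alignArgStackSize(20, 16, 4) == 28); // 32-bit slots: results land on 16n + 12
  assert(alignArgStackSize(30, 16, 4) == 44);
  assert(alignArgStackSize(20, 16, 8) == 24); // 64-bit slots: results land on 16n + 8
  return 0;
}
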
3316
3317/// MatchingStackOffset - Return true if the given stack call argument is
3318/// already available in the same position (relatively) of the caller's
3319/// incoming argument stack.
3320static
3321bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3322 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3323 const X86InstrInfo *TII) {
3324 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3325 int FI = INT_MAX;
3326 if (Arg.getOpcode() == ISD::CopyFromReg) {
3327 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3328 if (!TargetRegisterInfo::isVirtualRegister(VR))
3329 return false;
3330 MachineInstr *Def = MRI->getVRegDef(VR);
3331 if (!Def)
3332 return false;
3333 if (!Flags.isByVal()) {
3334 if (!TII->isLoadFromStackSlot(Def, FI))
3335 return false;
3336 } else {
3337 unsigned Opcode = Def->getOpcode();
3338 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3339 Opcode == X86::LEA64_32r) &&
3340 Def->getOperand(1).isFI()) {
3341 FI = Def->getOperand(1).getIndex();
3342 Bytes = Flags.getByValSize();
3343 } else
3344 return false;
3345 }
3346 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3347 if (Flags.isByVal())
3348 // ByVal argument is passed in as a pointer but it's now being
3349 // dereferenced. e.g.
3350 // define @foo(%struct.X* %A) {
3351 // tail call @bar(%struct.X* byval %A)
3352 // }
3353 return false;
3354 SDValue Ptr = Ld->getBasePtr();
3355 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3356 if (!FINode)
3357 return false;
3358 FI = FINode->getIndex();
3359 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3360 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3361 FI = FINode->getIndex();
3362 Bytes = Flags.getByValSize();
3363 } else
3364 return false;
3365
3366 assert(FI != INT_MAX);
3367 if (!MFI->isFixedObjectIndex(FI))
3368 return false;
3369 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3370}
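
For intuition only (the functions below are hypothetical, not from this file): the pattern MatchingStackOffset is meant to accept is a wrapper that forwards its stack arguments unchanged. Each outgoing argument is then already a load from the caller's fixed incoming slot at the same offset and size, so the call can typically be emitted as a sibcall.

int callee(int a, int b);

// On 32-bit x86 with the default C calling convention, 'a' and 'b' already
// live at the incoming argument offsets the callee expects, so no stack
// shuffling is needed before jumping to callee.
int wrapper(int a, int b) {
  return callee(a, b);
}
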
3371
3372/// IsEligibleForTailCallOptimization - Check whether the call is eligible
3373/// for tail call optimization. Targets which want to do tail call
3374/// optimization should implement this function.
3375bool
3376X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3377 CallingConv::ID CalleeCC,
3378 bool isVarArg,
3379 bool isCalleeStructRet,
3380 bool isCallerStructRet,
3381 Type *RetTy,
3382 const SmallVectorImpl<ISD::OutputArg> &Outs,
3383 const SmallVectorImpl<SDValue> &OutVals,
3384 const SmallVectorImpl<ISD::InputArg> &Ins,
3385 SelectionDAG &DAG) const {
3386 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3387 return false;
3388
3389 // If -tailcallopt is specified, make fastcc functions tail-callable.
3390 const MachineFunction &MF = DAG.getMachineFunction();
3391 const Function *CallerF = MF.getFunction();
3392
3393 // If the function return type is x86_fp80 and the callee return type is not,
3394 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3395 // perform a tailcall optimization here.
3396 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3397 return false;
3398
3399 CallingConv::ID CallerCC = CallerF->getCallingConv();
3400 bool CCMatch = CallerCC == CalleeCC;
3401 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3402 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3403
3404 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3405 if (IsTailCallConvention(CalleeCC) && CCMatch)
3406 return true;
3407 return false;
3408 }
3409
3410 // Look for obvious safe cases to perform tail call optimization that do not
3411 // require ABI changes. This is what gcc calls sibcall.
3412
3413 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3414 // emit a special epilogue.
3415 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3416 DAG.getSubtarget().getRegisterInfo());
3417 if (RegInfo->needsStackRealignment(MF))
3418 return false;
3419
3420 // Also avoid sibcall optimization if either caller or callee uses struct
3421 // return semantics.
3422 if (isCalleeStructRet || isCallerStructRet)
3423 return false;
3424
3425 // An stdcall/thiscall caller is expected to clean up its arguments; the
3426 // callee isn't going to do that.
3427 // FIXME: this is more restrictive than needed. We could produce a tailcall
3428 // when the stack adjustment matches. For example, with a thiscall that takes
3429 // only one argument.
3430 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3431 CallerCC == CallingConv::X86_ThisCall))
3432 return false;
3433
3434 // Do not sibcall optimize vararg calls unless all arguments are passed via
3435 // registers.
3436 if (isVarArg && !Outs.empty()) {
3437
3438 // Optimizing for varargs on Win64 is unlikely to be safe without
3439 // additional testing.
3440 if (IsCalleeWin64 || IsCallerWin64)
3441 return false;
3442
3443 SmallVector<CCValAssign, 16> ArgLocs;
3444 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3445 *DAG.getContext());
3446
3447 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3448 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3449 if (!ArgLocs[i].isRegLoc())
3450 return false;
3451 }
3452
3453 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3454 // stack. Therefore, if it's not used by the call it is not safe to optimize
3455 // this into a sibcall.
3456 bool Unused = false;
3457 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3458 if (!Ins[i].Used) {
3459 Unused = true;
3460 break;
3461 }
3462 }
3463 if (Unused) {
3464 SmallVector<CCValAssign, 16> RVLocs;
3465 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3466 *DAG.getContext());
3467 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3468 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3469 CCValAssign &VA = RVLocs[i];
3470 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3471 return false;
3472 }
3473 }
3474
3475 // If the calling conventions do not match, then we'd better make sure the
3476 // results are returned in the same way as what the caller expects.
3477 if (!CCMatch) {
3478 SmallVector<CCValAssign, 16> RVLocs1;
3479 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3480 *DAG.getContext());
3481 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3482
3483 SmallVector<CCValAssign, 16> RVLocs2;
3484 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3485 *DAG.getContext());
3486 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3487
3488 if (RVLocs1.size() != RVLocs2.size())
3489 return false;
3490 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3491 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3492 return false;
3493 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3494 return false;
3495 if (RVLocs1[i].isRegLoc()) {
3496 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3497 return false;
3498 } else {
3499 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3500 return false;
3501 }
3502 }
3503 }
3504
3505 // If the callee takes no arguments then go on to check the results of the
3506 // call.
3507 if (!Outs.empty()) {
3508 // Check if stack adjustment is needed. For now, do not do this if any
3509 // argument is passed on the stack.
3510 SmallVector<CCValAssign, 16> ArgLocs;
3511 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3512 *DAG.getContext());
3513
3514 // Allocate shadow area for Win64
3515 if (IsCalleeWin64)
3516 CCInfo.AllocateStack(32, 8);
3517
3518 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3519 if (CCInfo.getNextStackOffset()) {
3520 MachineFunction &MF = DAG.getMachineFunction();
3521 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3522 return false;
3523
3524 // Check if the arguments are already laid out in the right way as
3525 // the caller's fixed stack objects.
3526 MachineFrameInfo *MFI = MF.getFrameInfo();
3527 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3528 const X86InstrInfo *TII =
3529 static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
3530 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3531 CCValAssign &VA = ArgLocs[i];
3532 SDValue Arg = OutVals[i];
3533 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3534 if (VA.getLocInfo() == CCValAssign::Indirect)
3535 return false;
3536 if (!VA.isRegLoc()) {
3537 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3538 MFI, MRI, TII))
3539 return false;
3540 }
3541 }
3542 }
3543
3544 // If the tailcall address may be in a register, then make sure it's
3545 // possible to register allocate for it. In 32-bit, the call address can
3546 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3547 // callee-saved registers are restored. These happen to be the same
3548 // registers used to pass 'inreg' arguments so watch out for those.
3549 if (!Subtarget->is64Bit() &&
3550 ((!isa<GlobalAddressSDNode>(Callee) &&
3551 !isa<ExternalSymbolSDNode>(Callee)) ||
3552 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3553 unsigned NumInRegs = 0;
3554 // In PIC we need an extra register to formulate the address computation
3555 // for the callee.
3556 unsigned MaxInRegs =
3557 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3558
3559 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3560 CCValAssign &VA = ArgLocs[i];
3561 if (!VA.isRegLoc())
3562 continue;
3563 unsigned Reg = VA.getLocReg();
3564 switch (Reg) {
3565 default: break;
3566 case X86::EAX: case X86::EDX: case X86::ECX:
3567 if (++NumInRegs == MaxInRegs)
3568 return false;
3569 break;
3570 }
3571 }
3572 }
3573 }
3574
3575 return true;
3576}
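
An illustrative source-level case for the x86_fp80 return-type check near the top of this function, assuming a target where long double is the 80-bit x87 type (function names are hypothetical): the caller's return needs an FP_EXTEND of the callee's result, so the call must not be tail-call optimized.

double produce();

long double consume() {
  return produce(); // result must be extended from f64 to x86_fp80 after the call
}
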
3577
3578FastISel *
3579X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3580 const TargetLibraryInfo *libInfo) const {
3581 return X86::createFastISel(funcInfo, libInfo);
3582}
3583
3584//===----------------------------------------------------------------------===//
3585// Other Lowering Hooks
3586//===----------------------------------------------------------------------===//
3587
3588static bool MayFoldLoad(SDValue Op) {
3589 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3590}
3591
3592static bool MayFoldIntoStore(SDValue Op) {
3593 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3594}
3595
3596static bool isTargetShuffle(unsigned Opcode) {
3597 switch(Opcode) {
3598 default: return false;
3599 case X86ISD::BLENDI:
3600 case X86ISD::PSHUFB:
3601 case X86ISD::PSHUFD:
3602 case X86ISD::PSHUFHW:
3603 case X86ISD::PSHUFLW:
3604 case X86ISD::SHUFP:
3605 case X86ISD::PALIGNR:
3606 case X86ISD::MOVLHPS:
3607 case X86ISD::MOVLHPD:
3608 case X86ISD::MOVHLPS:
3609 case X86ISD::MOVLPS:
3610 case X86ISD::MOVLPD:
3611 case X86ISD::MOVSHDUP:
3612 case X86ISD::MOVSLDUP:
3613 case X86ISD::MOVDDUP:
3614 case X86ISD::MOVSS:
3615 case X86ISD::MOVSD:
3616 case X86ISD::UNPCKL:
3617 case X86ISD::UNPCKH:
3618 case X86ISD::VPERMILPI:
3619 case X86ISD::VPERM2X128:
3620 case X86ISD::VPERMI:
3621 return true;
3622 }
3623}
3624
3625static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3626 SDValue V1, SelectionDAG &DAG) {
3627 switch(Opc) {
3628 default: llvm_unreachable("Unknown x86 shuffle node");
3629 case X86ISD::MOVSHDUP:
3630 case X86ISD::MOVSLDUP:
3631 case X86ISD::MOVDDUP:
3632 return DAG.getNode(Opc, dl, VT, V1);
3633 }
3634}
3635
3636static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3637 SDValue V1, unsigned TargetMask,
3638 SelectionDAG &DAG) {
3639 switch(Opc) {
3640 default: llvm_unreachable("Unknown x86 shuffle node");
3641 case X86ISD::PSHUFD:
3642 case X86ISD::PSHUFHW:
3643 case X86ISD::PSHUFLW:
3644 case X86ISD::VPERMILPI:
3645 case X86ISD::VPERMI:
3646 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3647 }
3648}
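
For the immediate-carrying single-input shuffles handled above (PSHUFD, PSHUFHW, PSHUFLW, VPERMILPI), the 8-bit TargetMask packs one 2-bit source index per destination lane, least significant pair first. A small illustrative helper, not part of this file, showing the encoding:

// Encode a 4-lane shuffle mask into a PSHUFD-style immediate.
static unsigned encodeShuffleImm(unsigned M0, unsigned M1, unsigned M2,
                                 unsigned M3) {
  return (M0 & 3) | ((M1 & 3) << 2) | ((M2 & 3) << 4) | ((M3 & 3) << 6);
}

// Example: the mask <2,3,0,1> (swap the two 64-bit halves of a v4i32) encodes
// to encodeShuffleImm(2, 3, 0, 1) == 0x4E.
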
3649
3650static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3651 SDValue V1, SDValue V2, unsigned TargetMask,
3652 SelectionDAG &DAG) {
3653 switch(Opc) {
3654 default: llvm_unreachable("Unknown x86 shuffle node");
3655 case X86ISD::PALIGNR:
3656 case X86ISD::VALIGN:
3657 case X86ISD::SHUFP:
3658 case X86ISD::VPERM2X128:
3659 return DAG.getNode(Opc, dl, VT, V1, V2,
3660 DAG.getConstant(TargetMask, MVT::i8));
3661 }
3662}
3663
3664static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3665 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3666 switch(Opc) {
3667 default: llvm_unreachable("Unknown x86 shuffle node");
3668 case X86ISD::MOVLHPS:
3669 case X86ISD::MOVLHPD:
3670 case X86ISD::MOVHLPS:
3671 case X86ISD::MOVLPS:
3672 case X86ISD::MOVLPD:
3673 case X86ISD::MOVSS:
3674 case X86ISD::MOVSD:
3675 case X86ISD::UNPCKL:
3676 case X86ISD::UNPCKH:
3677 return DAG.getNode(Opc, dl, VT, V1, V2);
3678 }
3679}
3680
3681SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3682 MachineFunction &MF = DAG.getMachineFunction();
3683 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3684 DAG.getSubtarget().getRegisterInfo());
3685 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3686 int ReturnAddrIndex = FuncInfo->getRAIndex();
3687
3688 if (ReturnAddrIndex == 0) {
3689 // Set up a frame object for the return address.
3690 unsigned SlotSize = RegInfo->getSlotSize();
3691 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3692 -(int64_t)SlotSize,
3693 false);
3694 FuncInfo->setRAIndex(ReturnAddrIndex);
3695 }
3696
3697 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3698}
3699
3700bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3701 bool hasSymbolicDisplacement) {
3702 // Offset should fit into 32 bit immediate field.
3703 if (!isInt<32>(Offset))
3704 return false;
3705
3706 // If we don't have a symbolic displacement - we don't have any extra
3707 // restrictions.
3708 if (!hasSymbolicDisplacement)
3709 return true;
3710
3711 // FIXME: Some tweaks might be needed for medium code model.
3712 if (M != CodeModel::Small && M != CodeModel::Kernel)
3713 return false;
3714
3715 // For the small code model we assume that the last object ends at least 16MB
3716 // before the 2^31 boundary. We may also accept pretty large negative constants,
3717 // knowing that all objects are in the positive half of the address space.
3718 if (M == CodeModel::Small && Offset < 16*1024*1024)
3719 return true;
3720
3721 // For the kernel code model we know that all objects reside in the negative
3722 // half of the 32-bit address space. We may not accept negative offsets, since
3723 // they may be just off, and we may accept pretty large positive ones.
3724 if (M == CodeModel::Kernel && Offset >= 0)
3725 return true;
3726
3727 return false;
3728}
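
A self-contained restatement of the rules above with a few worked values folded into the comments; the enum and helper names are illustrative only and are not the LLVM API:

#include <cstdint>

enum class Model { Small, Kernel, Other };

// Mirrors the logic of X86::isOffsetSuitableForCodeModel, for illustration.
static bool offsetFitsCodeModel(int64_t Offset, Model M, bool HasSymDisp) {
  if (Offset < INT32_MIN || Offset > INT32_MAX)
    return false;                       // must fit a signed 32-bit immediate
  if (!HasSymDisp)
    return true;                        // no symbolic displacement: no extra limits
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;   // e.g. 10 MiB is accepted, 32 MiB is not
  if (M == Model::Kernel)
    return Offset >= 0;                 // e.g. -8 is rejected in the kernel model
  return false;
}
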
3729
3730/// isCalleePop - Determines whether the callee is required to pop its
3731/// own arguments. Callee pop is necessary to support tail calls.
3732bool X86::isCalleePop(CallingConv::ID CallingConv,
3733 bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3734 switch (CallingConv) {
3735 default:
3736 return false;
3737 case CallingConv::X86_StdCall:
3738 case CallingConv::X86_FastCall:
3739 case CallingConv::X86_ThisCall:
3740 return !is64Bit;
3741 case CallingConv::Fast:
3742 case CallingConv::GHC:
3743 case CallingConv::HiPE:
3744 if (IsVarArg)
3745 return false;
3746 return TailCallOpt;
3747 }
3748}
3749
3750/// \brief Return true if the condition is an unsigned comparison operation.
3751static bool isX86CCUnsigned(unsigned X86CC) {
3752 switch (X86CC) {
3753 default: llvm_unreachable("Invalid integer condition!");
3754 case X86::COND_E: return true;
3755 case X86::COND_G: return false;
3756 case X86::COND_GE: return false;
3757 case X86::COND_L: return false;
3758 case X86::COND_LE: return false;
3759 case X86::COND_NE: return true;
3760 case X86::COND_B: return true;
3761 case X86::COND_A: return true;
3762 case X86::COND_BE: return true;
3763 case X86::COND_AE: return true;
3764 }
3765 llvm_unreachable("covered switch fell through?!");
3766}
3767
3768/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86-
3769/// specific condition code, returning the condition code and the LHS/RHS of the
3770/// comparison to make.
3771static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3772 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3773 if (!isFP) {
3774 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3775 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3776 // X > -1 -> X == 0, jump !sign.
3777 RHS = DAG.getConstant(0, RHS.getValueType());
3778 return X86::COND_NS;
3779 }
3780 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3781 // X < 0 -> X == 0, jump on sign.
3782 return X86::COND_S;
3783 }
3784 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3785 // X < 1 -> X <= 0
3786 RHS = DAG.getConstant(0, RHS.getValueType());
3787 return X86::COND_LE;
3788 }
3789 }
3790
3791 switch (SetCCOpcode) {
3792 default: llvm_unreachable("Invalid integer condition!");
3793 case ISD::SETEQ: return X86::COND_E;
3794 case ISD::SETGT: return X86::COND_G;
3795 case ISD::SETGE: return X86::COND_GE;
3796 case ISD::SETLT: return X86::COND_L;
3797 case ISD::SETLE: return X86::COND_LE;
3798 case ISD::SETNE: return X86::COND_NE;
3799 case ISD::SETULT: return X86::COND_B;
3800 case ISD::SETUGT: return X86::COND_A;
3801 case ISD::SETULE: return X86::COND_BE;
3802 case ISD::SETUGE: return X86::COND_AE;
3803 }
3804 }
3805
3806 // First determine if it is required or is profitable to flip the operands.
3807
3808 // If LHS is a foldable load, but RHS is not, flip the condition.
3809 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3810 !ISD::isNON_EXTLoad(RHS.getNode())) {
3811 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3812 std::swap(LHS, RHS);
3813 }
3814
3815 switch (SetCCOpcode) {
3816 default: break;
3817 case ISD::SETOLT:
3818 case ISD::SETOLE:
3819 case ISD::SETUGT:
3820 case ISD::SETUGE:
3821 std::swap(LHS, RHS);
3822 break;
3823 }
3824
3825 // On a floating point condition, the flags are set as follows:
3826 // ZF PF CF op
3827 // 0 | 0 | 0 | X > Y
3828 // 0 | 0 | 1 | X < Y
3829 // 1 | 0 | 0 | X == Y
3830 // 1 | 1 | 1 | unordered
3831 switch (SetCCOpcode) {
3832 default: llvm_unreachable("Condcode should be pre-legalized away");
3833 case ISD::SETUEQ:
3834 case ISD::SETEQ: return X86::COND_E;
3835 case ISD::SETOLT: // flipped
3836 case ISD::SETOGT:
3837 case ISD::SETGT: return X86::COND_A;
3838 case ISD::SETOLE: // flipped
3839 case ISD::SETOGE:
3840 case ISD::SETGE: return X86::COND_AE;
3841 case ISD::SETUGT: // flipped
3842 case ISD::SETULT:
3843 case ISD::SETLT: return X86::COND_B;
3844 case ISD::SETUGE: // flipped
3845 case ISD::SETULE:
3846 case ISD::SETLE: return X86::COND_BE;
3847 case ISD::SETONE:
3848 case ISD::SETNE: return X86::COND_NE;
3849 case ISD::SETUO: return X86::COND_P;
3850 case ISD::SETO: return X86::COND_NP;
3851 case ISD::SETOEQ:
3852 case ISD::SETUNE: return X86::COND_INVALID;
3853 }
3854}
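
A reduced sketch of the integer-only path of the translation above, with plain enums standing in for ISD::CondCode and X86::CondCode (illustrative names; only a handful of codes and two of the constant-RHS rewrites are shown):

#include <cassert>

enum CC { SETEQ, SETGT, SETLT, SETULT };
enum X86CC { COND_E, COND_G, COND_L, COND_B, COND_NS, COND_LE };

// Translate an integer comparison; RHS may be rewritten (e.g. "x > -1" -> test the sign bit).
static X86CC translateInt(CC Op, long &RHS) {
  if (Op == SETGT && RHS == -1) { RHS = 0; return COND_NS; } // x > -1  ->  jump if sign clear
  if (Op == SETLT && RHS == 1)  { RHS = 0; return COND_LE; } // x < 1   ->  x <= 0
  switch (Op) {
  case SETEQ:  return COND_E;
  case SETGT:  return COND_G;
  case SETLT:  return COND_L;
  case SETULT: return COND_B;  // unsigned below
  }
  return COND_E;
}

int main() {
  long RHS = -1;
  assert(translateInt(SETGT, RHS) == COND_NS && RHS == 0);
}
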
3855
3856/// hasFPCMov - is there a floating point cmov for the specific X86 condition
3857/// code. The current x86 ISA includes the following FP cmov instructions:
3858/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3859static bool hasFPCMov(unsigned X86CC) {
3860 switch (X86CC) {
3861 default:
3862 return false;
3863 case X86::COND_B:
3864 case X86::COND_BE:
3865 case X86::COND_E:
3866 case X86::COND_P:
3867 case X86::COND_A:
3868 case X86::COND_AE:
3869 case X86::COND_NE:
3870 case X86::COND_NP:
3871 return true;
3872 }
3873}
3874
3875/// isFPImmLegal - Returns true if the target can instruction select the
3876/// specified FP immediate natively. If false, the legalizer will
3877/// materialize the FP immediate as a load from a constant pool.
3878bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3879 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3880 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3881 return true;
3882 }
3883 return false;
3884}
3885
3886bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3887 ISD::LoadExtType ExtTy,
3888 EVT NewVT) const {
3889 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3890 // relocations target a movq or addq instruction: don't let the load shrink.
3891 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3892 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3893 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3894 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3895 return true;
3896}
3897
3898/// \brief Returns true if it is beneficial to convert a load of a constant
3899/// to just the constant itself.
3900bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3901 Type *Ty) const {
3902 assert(Ty->isIntegerTy());
3903
3904 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3905 if (BitSize == 0 || BitSize > 64)
3906 return false;
3907 return true;
3908}
3909
3910bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3911 unsigned Index) const {
3912 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3913 return false;
3914
3915 return (Index == 0 || Index == ResVT.getVectorNumElements());
3916}
3917
3918bool X86TargetLowering::isCheapToSpeculateCttz() const {
3919 // Speculate cttz only if we can directly use TZCNT.
3920 return Subtarget->hasBMI();
3921}
3922
3923bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3924 // Speculate ctlz only if we can directly use LZCNT.
3925 return Subtarget->hasLZCNT();
3926}
3927
3928/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3929/// the specified range [Low, Hi).
3930static bool isUndefOrInRange(int Val, int Low, int Hi) {
3931 return (Val < 0) || (Val >= Low && Val < Hi);
3932}
3933
3934/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3935/// specified value.
3936static bool isUndefOrEqual(int Val, int CmpVal) {
3937 return (Val < 0 || Val == CmpVal);
3938}
3939
3940/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3941/// from position Pos and ending in Pos+Size, falls within the specified
3942/// sequential range [Low, Low+Size), or is undef.
3943static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3944 unsigned Pos, unsigned Size, int Low) {
3945 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3946 if (!isUndefOrEqual(Mask[i], Low))
3947 return false;
3948 return true;
3949}
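
A small self-check of the two helpers above, re-implemented on plain ints purely for illustration (the names mirror the LLVM routines but this is not the LLVM code):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }

static bool seqOrUndef(const std::vector<int> &M, unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!undefOrEq(M[i], Low))
      return false;
  return true;
}

int main() {
  // Elements 4..7 count up from 4, with one undef (-1) hole: still sequential-or-undef.
  assert(seqOrUndef({0, 1, 2, 3, 4, -1, 6, 7}, 4, 4, 4));
  // A swapped pair breaks the sequence.
  assert(!seqOrUndef({0, 1, 2, 3, 5, 4, 6, 7}, 4, 4, 4));
}
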
3950
3951/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3952/// is suitable for input to PSHUFD. That is, it doesn't reference the other
3953/// operand; by default it matches against the first operand.
3954static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3955 bool TestSecondOperand = false) {
3956 if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3957 VT != MVT::v2f64 && VT != MVT::v2i64)
3958 return false;
3959
3960 unsigned NumElems = VT.getVectorNumElements();
3961 unsigned Lo = TestSecondOperand ? NumElems : 0;
3962 unsigned Hi = Lo + NumElems;
3963
3964 for (unsigned i = 0; i < NumElems; ++i)
3965 if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3966 return false;
3967
3968 return true;
3969}
3970
3971/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3972/// is suitable for input to PSHUFHW.
3973static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3974 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3975 return false;
3976
3977 // Lower quadword copied in order or undef.
3978 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3979 return false;
3980
3981 // Upper quadword shuffled.
3982 for (unsigned i = 4; i != 8; ++i)
3983 if (!isUndefOrInRange(Mask[i], 4, 8))
3984 return false;
3985
3986 if (VT == MVT::v16i16) {
3987 // Lower quadword copied in order or undef.
3988 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3989 return false;
3990
3991 // Upper quadword shuffled.
3992 for (unsigned i = 12; i != 16; ++i)
3993 if (!isUndefOrInRange(Mask[i], 12, 16))
3994 return false;
3995 }
3996
3997 return true;
3998}
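
As a concrete illustration of the PSHUFHW shape, here is a standalone check for the v8i16 case only (a simplified re-implementation on plain ints; the v16i16 lane handling above is omitted):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }
static bool undefOrIn(int V, int Lo, int Hi) { return V < 0 || (V >= Lo && V < Hi); }

static bool looksLikePSHUFHW(const std::vector<int> &M) {
  for (int i = 0; i != 4; ++i)          // low quadword must be copied in order (or undef)
    if (!undefOrEq(M[i], i))
      return false;
  for (int i = 4; i != 8; ++i)          // high quadword may only be permuted within itself
    if (!undefOrIn(M[i], 4, 8))
      return false;
  return true;
}

int main() {
  assert(looksLikePSHUFHW({0, 1, 2, 3, 7, 6, 5, 4}));   // reverse the high half: OK
  assert(!looksLikePSHUFHW({0, 1, 2, 3, 0, 1, 2, 3}));  // pulls from the low half: not PSHUFHW
}
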
3999
4000/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
4001/// is suitable for input to PSHUFLW.
4002static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4003 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
4004 return false;
4005
4006 // Upper quadword copied in order.
4007 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4008 return false;
4009
4010 // Lower quadword shuffled.
4011 for (unsigned i = 0; i != 4; ++i)
4012 if (!isUndefOrInRange(Mask[i], 0, 4))
4013 return false;
4014
4015 if (VT == MVT::v16i16) {
4016 // Upper quadword copied in order.
4017 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4018 return false;
4019
4020 // Lower quadword shuffled.
4021 for (unsigned i = 8; i != 12; ++i)
4022 if (!isUndefOrInRange(Mask[i], 8, 12))
4023 return false;
4024 }
4025
4026 return true;
4027}
4028
4029/// \brief Return true if the mask specifies a shuffle of elements that is
4030/// suitable for input to intralane (palignr) or interlane (valign) vector
4031/// right-shift.
4032static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4033 unsigned NumElts = VT.getVectorNumElements();
4034 unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4035 unsigned NumLaneElts = NumElts/NumLanes;
4036
4037 // Do not handle 64-bit element shuffles with palignr.
4038 if (NumLaneElts == 2)
4039 return false;
4040
4041 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4042 unsigned i;
4043 for (i = 0; i != NumLaneElts; ++i) {
4044 if (Mask[i+l] >= 0)
4045 break;
4046 }
4047
4048 // Lane is all undef, go to next lane
4049 if (i == NumLaneElts)
4050 continue;
4051
4052 int Start = Mask[i+l];
4053
4054 // Make sure it's in this lane in one of the sources
4055 if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4056 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4057 return false;
4058
4059 // If not lane 0, then we must match lane 0
4060 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4061 return false;
4062
4063 // Correct second source to be contiguous with first source
4064 if (Start >= (int)NumElts)
4065 Start -= NumElts - NumLaneElts;
4066
4067 // Make sure we're shifting in the right direction.
4068 if (Start <= (int)(i+l))
4069 return false;
4070
4071 Start -= i;
4072
4073 // Check the rest of the elements to see if they are consecutive.
4074 for (++i; i != NumLaneElts; ++i) {
4075 int Idx = Mask[i+l];
4076
4077 // Make sure it's in this lane
4078 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4079 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4080 return false;
4081
4082 // If not lane 0, then we must match lane 0
4083 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4084 return false;
4085
4086 if (Idx >= (int)NumElts)
4087 Idx -= NumElts - NumLaneElts;
4088
4089 if (!isUndefOrEqual(Idx, Start+i))
4090 return false;
4091
4092 }
4093 }
4094
4095 return true;
4096}
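
A minimal sketch of the idea behind the contiguity check above, for a single 128-bit lane of i8 elements: a mask that selects a consecutive run maps directly to a PALIGNR byte immediate (simplified; the real code also handles undef elements, lane boundaries and the two-source wraparound):

#include <array>
#include <cassert>

static int palignrImm(const std::array<int, 16> &Mask) {
  int Start = Mask[0];                 // first selected element
  for (int i = 1; i != 16; ++i)        // every following element must be consecutive
    if (Mask[i] != Start + i)
      return -1;                       // not a plain rotation
  return Start;                        // i8 elements, so the byte immediate equals Start
}

int main() {
  std::array<int, 16> M;
  for (int i = 0; i != 16; ++i)
    M[i] = i + 4;                      // {4, 5, ..., 19}: rotate the concatenation by 4 bytes
  assert(palignrImm(M) == 4);
}
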
4097
4098/// \brief Return true if the node specifies a shuffle of elements that is
4099/// suitable for input to PALIGNR.
4100static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4101 const X86Subtarget *Subtarget) {
4102 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4103 (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4104 VT.is512BitVector())
4105 // FIXME: Add AVX512BW.
4106 return false;
4107
4108 return isAlignrMask(Mask, VT, false);
4109}
4110
4111/// \brief Return true if the node specifies a shuffle of elements that is
4112/// suitable for input to VALIGN.
4113static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4114 const X86Subtarget *Subtarget) {
4115 // FIXME: Add AVX512VL.
4116 if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4117 return false;
4118 return isAlignrMask(Mask, VT, true);
4119}
4120
4121/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4122/// the two vector operands have swapped position.
4123static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4124 unsigned NumElems) {
4125 for (unsigned i = 0; i != NumElems; ++i) {
4126 int idx = Mask[i];
4127 if (idx < 0)
4128 continue;
4129 else if (idx < (int)NumElems)
4130 Mask[i] = idx + NumElems;
4131 else
4132 Mask[i] = idx - NumElems;
4133 }
4134}
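
A quick illustration of the commuting rule above on a 4-element mask: indices that referred to V1 (0..3) move into V2's range (4..7) and vice versa, while undef (-1) entries are left untouched:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> M = {0, 5, -1, 2};
  for (int &Idx : M)
    if (Idx >= 0)
      Idx = (Idx < 4) ? Idx + 4 : Idx - 4;   // swap which operand each index points at
  assert((M == std::vector<int>{4, 1, -1, 6}));
}
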
4135
4136/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4137/// specifies a shuffle of elements that is suitable for input to 128/256-bit
4138/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4139/// reverse of what x86 shuffles want.
4140static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4141
4142 unsigned NumElems = VT.getVectorNumElements();
4143 unsigned NumLanes = VT.getSizeInBits()/128;
4144 unsigned NumLaneElems = NumElems/NumLanes;
4145
4146 if (NumLaneElems != 2 && NumLaneElems != 4)
4147 return false;
4148
4149 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4150 bool symetricMaskRequired =
4151 (VT.getSizeInBits() >= 256) && (EltSize == 32);
4152
4153 // VSHUFPSY divides the resulting vector into 4 chunks.
4154 // The sources are also split into 4 chunks, and each destination
4155 // chunk must come from a different source chunk.
4156 //
4157 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
4158 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
4159 //
4160 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
4161 // Y3..Y0, Y3..Y0, X3..X0, X3..X0
4162 //
4163 // VSHUFPDY divides the resulting vector into 4 chunks.
4164 // The sources are also split into 4 chunks, and each destination
4165 // chunk must come from a different source chunk.
4166 //
4167 // SRC1 => X3 X2 X1 X0
4168 // SRC2 => Y3 Y2 Y1 Y0
4169 //
4170 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
4171 //
4172 SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4173 unsigned HalfLaneElems = NumLaneElems/2;
4174 for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4175 for (unsigned i = 0; i != NumLaneElems; ++i) {
4176 int Idx = Mask[i+l];
4177 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4178 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4179 return false;
4180 // For VSHUFPSY, the mask of the second half must be the same as the
4181 // first but with the appropriate offsets. This works in the same way as
4182 // VPERMILPS works with masks.
4183 if (!symetricMaskRequired || Idx < 0)
4184 continue;
4185 if (MaskVal[i] < 0) {
4186 MaskVal[i] = Idx - l;
4187 continue;
4188 }
4189 if ((signed)(Idx - l) != MaskVal[i])
4190 return false;
4191 }
4192 }
4193
4194 return true;
4195}
4196
4197/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4198/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4199static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4200 if (!VT.is128BitVector())
4201 return false;
4202
4203 unsigned NumElems = VT.getVectorNumElements();
4204
4205 if (NumElems != 4)
4206 return false;
4207
4208 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4209 return isUndefOrEqual(Mask[0], 6) &&
4210 isUndefOrEqual(Mask[1], 7) &&
4211 isUndefOrEqual(Mask[2], 2) &&
4212 isUndefOrEqual(Mask[3], 3);
4213}
4214
4215/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4216/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4217/// <2, 3, 2, 3>
4218static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4219 if (!VT.is128BitVector())
4220 return false;
4221
4222 unsigned NumElems = VT.getVectorNumElements();
4223
4224 if (NumElems != 4)
4225 return false;
4226
4227 return isUndefOrEqual(Mask[0], 2) &&
4228 isUndefOrEqual(Mask[1], 3) &&
4229 isUndefOrEqual(Mask[2], 2) &&
4230 isUndefOrEqual(Mask[3], 3);
4231}
4232
4233/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4234/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4235static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4236 if (!VT.is128BitVector())
4237 return false;
4238
4239 unsigned NumElems = VT.getVectorNumElements();
4240
4241 if (NumElems != 2 && NumElems != 4)
4242 return false;
4243
4244 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4245 if (!isUndefOrEqual(Mask[i], i + NumElems))
4246 return false;
4247
4248 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4249 if (!isUndefOrEqual(Mask[i], i))
4250 return false;
4251
4252 return true;
4253}
4254
4255/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4256/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4257static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4258 if (!VT.is128BitVector())
4259 return false;
4260
4261 unsigned NumElems = VT.getVectorNumElements();
4262
4263 if (NumElems != 2 && NumElems != 4)
4264 return false;
4265
4266 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4267 if (!isUndefOrEqual(Mask[i], i))
4268 return false;
4269
4270 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4271 if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4272 return false;
4273
4274 return true;
4275}
4276
4277/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4278/// specifies a shuffle of elements that is suitable for input to INSERTPS.
4279/// i.e., if all but one element come from the same vector.
4280static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4281 // TODO: Deal with AVX's VINSERTPS
4282 if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4283 return false;
4284
4285 unsigned CorrectPosV1 = 0;
4286 unsigned CorrectPosV2 = 0;
4287 for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4288 if (Mask[i] == -1) {
4289 ++CorrectPosV1;
4290 ++CorrectPosV2;
4291 continue;
4292 }
4293
4294 if (Mask[i] == i)
4295 ++CorrectPosV1;
4296 else if (Mask[i] == i + 4)
4297 ++CorrectPosV2;
4298 }
4299
4300 if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4301 // We have 3 elements (undefs count as elements from any vector) from one
4302 // vector, and one from another.
4303 return true;
4304
4305 return false;
4306}
4307
4308//
4309// Some special combinations that can be optimized.
4310//
4311static
4312SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4313 SelectionDAG &DAG) {
4314 MVT VT = SVOp->getSimpleValueType(0);
4315 SDLoc dl(SVOp);
4316
4317 if (VT != MVT::v8i32 && VT != MVT::v8f32)
4318 return SDValue();
4319
4320 ArrayRef<int> Mask = SVOp->getMask();
4321
4322 // These are the special masks that may be optimized.
4323 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4324 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
4325 bool MatchEvenMask = true;
4326 bool MatchOddMask = true;
4327 for (int i=0; i<8; ++i) {
4328 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4329 MatchEvenMask = false;
4330 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4331 MatchOddMask = false;
4332 }
4333
4334 if (!MatchEvenMask && !MatchOddMask)
4335 return SDValue();
4336
4337 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4338
4339 SDValue Op0 = SVOp->getOperand(0);
4340 SDValue Op1 = SVOp->getOperand(1);
4341
4342 if (MatchEvenMask) {
4343 // Shift the second operand right to 32 bits.
4344 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4345 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4346 } else {
4347 // Shift the first operand left to 32 bits.
4348 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4349 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4350 }
4351 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4352 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4353}
4354
4355/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4356/// specifies a shuffle of elements that is suitable for input to UNPCKL.
4357static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4358 bool HasInt256, bool V2IsSplat = false) {
4359
4360 assert(VT.getSizeInBits() >= 128 &&
4361 "Unsupported vector type for unpckl");
4362
4363 unsigned NumElts = VT.getVectorNumElements();
4364 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4365 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4366 return false;
4367
4368 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4369 "Unsupported vector type for unpckh");
4370
4371 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4372 unsigned NumLanes = VT.getSizeInBits()/128;
4373 unsigned NumLaneElts = NumElts/NumLanes;
4374
4375 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4376 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4377 int BitI = Mask[l+i];
4378 int BitI1 = Mask[l+i+1];
4379 if (!isUndefOrEqual(BitI, j))
4380 return false;
4381 if (V2IsSplat) {
4382 if (!isUndefOrEqual(BitI1, NumElts))
4383 return false;
4384 } else {
4385 if (!isUndefOrEqual(BitI1, j + NumElts))
4386 return false;
4387 }
4388 }
4389 }
4390
4391 return true;
4392}
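
The canonical 128-bit UNPCKL pattern accepted above, shown for the v4i32 case on plain ints (a simplified re-implementation: one lane only, no V2IsSplat handling):

#include <cassert>
#include <vector>

static bool undefOrEq(int V, int C) { return V < 0 || V == C; }

// unpcklps interleaves the low halves: result = { A0, B0, A1, B1 },
// where B's elements are mask indices 4..7.
static bool isUnpackLo4(const std::vector<int> &M) {
  for (int i = 0, j = 0; i != 4; i += 2, ++j)
    if (!undefOrEq(M[i], j) || !undefOrEq(M[i + 1], j + 4))
      return false;
  return true;
}

int main() {
  assert(isUnpackLo4({0, 4, 1, 5}));    // unpcklps
  assert(!isUnpackLo4({2, 6, 3, 7}));   // that is unpckhps, not unpcklps
}
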
4393
4394/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4395/// specifies a shuffle of elements that is suitable for input to UNPCKH.
4396static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4397 bool HasInt256, bool V2IsSplat = false) {
4398 assert(VT.getSizeInBits() >= 128 &&
4399 "Unsupported vector type for unpckh");
4400
4401 unsigned NumElts = VT.getVectorNumElements();
4402 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4403 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4404 return false;
4405
4406 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4407 "Unsupported vector type for unpckh");
4408
4409 // AVX defines UNPCK* to operate independently on 128-bit lanes.
4410 unsigned NumLanes = VT.getSizeInBits()/128;
4411 unsigned NumLaneElts = NumElts/NumLanes;
4412
4413 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4414 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4415 int BitI = Mask[l+i];
4416 int BitI1 = Mask[l+i+1];
4417 if (!isUndefOrEqual(BitI, j))
4418 return false;
4419 if (V2IsSplat) {
4420 if (isUndefOrEqual(BitI1, NumElts))
4421 return false;
4422 } else {
4423 if (!isUndefOrEqual(BitI1, j+NumElts))
4424 return false;
4425 }
4426 }
4427 }
4428 return true;
4429}
4430
4431/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4432/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4433/// <0, 0, 1, 1>
4434static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4435 unsigned NumElts = VT.getVectorNumElements();
4436 bool Is256BitVec = VT.is256BitVector();
4437
4438 if (VT.is512BitVector())
4439 return false;
4440 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4441 "Unsupported vector type for unpckh");
4442
4443 if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4444 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4445 return false;
4446
4447 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4448 // FIXME: Need a better way to get rid of this, there's no latency difference
4449 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
4450 // the former later. We should also remove the "_undef" special mask.
4451 if (NumElts == 4 && Is256BitVec)
4452 return false;
4453
4454 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4455 // independently on 128-bit lanes.
4456 unsigned NumLanes = VT.getSizeInBits()/128;
4457 unsigned NumLaneElts = NumElts/NumLanes;
4458
4459 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4460 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4461 int BitI = Mask[l+i];
4462 int BitI1 = Mask[l+i+1];
4463
4464 if (!isUndefOrEqual(BitI, j))
4465 return false;
4466 if (!isUndefOrEqual(BitI1, j))
4467 return false;
4468 }
4469 }
4470
4471 return true;
4472}
4473
4474/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4475/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4476/// <2, 2, 3, 3>
4477static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4478 unsigned NumElts = VT.getVectorNumElements();
4479
4480 if (VT.is512BitVector())
4481 return false;
4482
4483 assert((VT.is128BitVector() || VT.is256BitVector()) &&
4484 "Unsupported vector type for unpckh");
4485
4486 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4487 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4488 return false;
4489
4490 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4491 // independently on 128-bit lanes.
4492 unsigned NumLanes = VT.getSizeInBits()/128;
4493 unsigned NumLaneElts = NumElts/NumLanes;
4494
4495 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4496 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4497 int BitI = Mask[l+i];
4498 int BitI1 = Mask[l+i+1];
4499 if (!isUndefOrEqual(BitI, j))
4500 return false;
4501 if (!isUndefOrEqual(BitI1, j))
4502 return false;
4503 }
4504 }
4505 return true;
4506}
4507
4508// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4509// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4510static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4511 if (!VT.is512BitVector())
4512 return false;
4513
4514 unsigned NumElts = VT.getVectorNumElements();
4515 unsigned HalfSize = NumElts/2;
4516 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4517 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4518 *Imm = 1;
4519 return true;
4520 }
4521 }
4522 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4523 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4524 *Imm = 0;
4525 return true;
4526 }
4527 }
4528 return false;
4529}
4530
4531/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4532/// specifies a shuffle of elements that is suitable for input to MOVSS,
4533/// MOVSD, and MOVD, i.e. setting the lowest element.
4534static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4535 if (VT.getVectorElementType().getSizeInBits() < 32)
4536 return false;
4537 if (!VT.is128BitVector())
4538 return false;
4539
4540 unsigned NumElts = VT.getVectorNumElements();
4541
4542 if (!isUndefOrEqual(Mask[0], NumElts))
4543 return false;
4544
4545 for (unsigned i = 1; i != NumElts; ++i)
4546 if (!isUndefOrEqual(Mask[i], i))
4547 return false;
4548
4549 return true;
4550}
4551
4552/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4553/// as permutations between 128-bit chunks or halves. As an example: this
4554/// shuffle below:
4555/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4556/// The first half comes from the second half of V1 and the second half from
4557/// the second half of V2.
4558static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4559 if (!HasFp256 || !VT.is256BitVector())
4560 return false;
4561
4562 // The shuffle result is divided into half A and half B. In total the two
4563 // sources have 4 halves, namely: C, D, E, F. The final values of A and
4564 // B must come from C, D, E or F.
4565 unsigned HalfSize = VT.getVectorNumElements()/2;
4566 bool MatchA = false, MatchB = false;
4567
4568 // Check if A comes from one of C, D, E, F.
4569 for (unsigned Half = 0; Half != 4; ++Half) {
4570 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4571 MatchA = true;
4572 break;
4573 }
4574 }
4575
4576 // Check if B comes from one of C, D, E, F.
4577 for (unsigned Half = 0; Half != 4; ++Half) {
4578 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4579 MatchB = true;
4580 break;
4581 }
4582 }
4583
4584 return MatchA && MatchB;
4585}
4586
4587/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4588/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
4589static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4590 MVT VT = SVOp->getSimpleValueType(0);
4591
4592 unsigned HalfSize = VT.getVectorNumElements()/2;
4593
4594 unsigned FstHalf = 0, SndHalf = 0;
4595 for (unsigned i = 0; i < HalfSize; ++i) {
4596 if (SVOp->getMaskElt(i) > 0) {
4597 FstHalf = SVOp->getMaskElt(i)/HalfSize;
4598 break;
4599 }
4600 }
4601 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4602 if (SVOp->getMaskElt(i) > 0) {
4603 SndHalf = SVOp->getMaskElt(i)/HalfSize;
4604 break;
4605 }
4606 }
4607
4608 return (FstHalf | (SndHalf << 4));
4609}
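
For the documented example mask <4, 5, 6, 7, 12, 13, 14, 15> (v8i32), the immediate computed above works out as follows (a worked example that skips the undef handling in the real loop):

#include <cassert>

int main() {
  const int Mask[8] = {4, 5, 6, 7, 12, 13, 14, 15};
  const int HalfSize = 4;                    // 8 elements, 2 result halves
  int FstHalf = Mask[0] / HalfSize;          // 1: low result half = high half of V1
  int SndHalf = Mask[HalfSize] / HalfSize;   // 3: high result half = high half of V2
  assert((FstHalf | (SndHalf << 4)) == 0x31);
}
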
4610
4611// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
4612static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4613 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4614 if (EltSize < 32)
4615 return false;
4616
4617 unsigned NumElts = VT.getVectorNumElements();
4618 Imm8 = 0;
4619 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4620 for (unsigned i = 0; i != NumElts; ++i) {
4621 if (Mask[i] < 0)
4622 continue;
4623 Imm8 |= Mask[i] << (i*2);
4624 }
4625 return true;
4626 }
4627
4628 unsigned LaneSize = 4;
4629 SmallVector<int, 4> MaskVal(LaneSize, -1);
4630
4631 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4632 for (unsigned i = 0; i != LaneSize; ++i) {
4633 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4634 return false;
4635 if (Mask[i+l] < 0)
4636 continue;
4637 if (MaskVal[i] < 0) {
4638 MaskVal[i] = Mask[i+l] - l;
4639 Imm8 |= MaskVal[i] << (i*2);
4640 continue;
4641 }
4642 if (Mask[i+l] != (signed)(MaskVal[i]+l))
4643 return false;
4644 }
4645 }
4646 return true;
4647}
4648
4649/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4650/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4651/// Note that VPERMIL mask matching differs depending on whether the underlying
4652/// element type is 32 or 64 bits. For VPERMILPS the high half of the mask should
4653/// select the same elements as the low half, but from the higher half of the source.
4654/// In VPERMILPD the two lanes could be shuffled independently of each other
4655/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4656static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4657 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4658 if (VT.getSizeInBits() < 256 || EltSize < 32)
4659 return false;
4660 bool symetricMaskRequired = (EltSize == 32);
4661 unsigned NumElts = VT.getVectorNumElements();
4662
4663 unsigned NumLanes = VT.getSizeInBits()/128;
4664 unsigned LaneSize = NumElts/NumLanes;
4665 // 2 or 4 elements in one lane
4666
4667 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4668 for (unsigned l = 0; l != NumElts; l += LaneSize) {
4669 for (unsigned i = 0; i != LaneSize; ++i) {
4670 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4671 return false;
4672 if (symetricMaskRequired) {
4673 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4674 ExpectedMaskVal[i] = Mask[i+l] - l;
4675 continue;
4676 }
4677 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4678 return false;
4679 }
4680 }
4681 }
4682 return true;
4683}
4684
4685/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
4686/// x86 movss wants: the lowest element must be the lowest element of vector 2,
4687/// and the other elements must come from vector 1 in order.
4688static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4689 bool V2IsSplat = false, bool V2IsUndef = false) {
4690 if (!VT.is128BitVector())
4691 return false;
4692
4693 unsigned NumOps = VT.getVectorNumElements();
4694 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4695 return false;
4696
4697 if (!isUndefOrEqual(Mask[0], 0))
4698 return false;
4699
4700 for (unsigned i = 1; i != NumOps; ++i)
4701 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4702 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4703 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4704 return false;
4705
4706 return true;
4707}
4708
4709/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4710/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4711/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4712static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4713 const X86Subtarget *Subtarget) {
4714 if (!Subtarget->hasSSE3())
4715 return false;
4716
4717 unsigned NumElems = VT.getVectorNumElements();
4718
4719 if ((VT.is128BitVector() && NumElems != 4) ||
4720 (VT.is256BitVector() && NumElems != 8) ||
4721 (VT.is512BitVector() && NumElems != 16))
4722 return false;
4723
4724 // "i+1" is the value the indexed mask element must have
4725 for (unsigned i = 0; i != NumElems; i += 2)
4726 if (!isUndefOrEqual(Mask[i], i+1) ||
4727 !isUndefOrEqual(Mask[i+1], i+1))
4728 return false;
4729
4730 return true;
4731}
4732
4733/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4734/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4735/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4736static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4737 const X86Subtarget *Subtarget) {
4738 if (!Subtarget->hasSSE3())
4739 return false;
4740
4741 unsigned NumElems = VT.getVectorNumElements();
4742
4743 if ((VT.is128BitVector() && NumElems != 4) ||
4744 (VT.is256BitVector() && NumElems != 8) ||
4745 (VT.is512BitVector() && NumElems != 16))
4746 return false;
4747
4748 // "i" is the value the indexed mask element must have
4749 for (unsigned i = 0; i != NumElems; i += 2)
4750 if (!isUndefOrEqual(Mask[i], i) ||
4751 !isUndefOrEqual(Mask[i+1], i))
4752 return false;
4753
4754 return true;
4755}
4756
4757/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4758/// specifies a shuffle of elements that is suitable for input to 256-bit
4759/// version of MOVDDUP.
4760static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4761 if (!HasFp256 || !VT.is256BitVector())
4762 return false;
4763
4764 unsigned NumElts = VT.getVectorNumElements();
4765 if (NumElts != 4)
4766 return false;
4767
4768 for (unsigned i = 0; i != NumElts/2; ++i)
4769 if (!isUndefOrEqual(Mask[i], 0))
4770 return false;
4771 for (unsigned i = NumElts/2; i != NumElts; ++i)
4772 if (!isUndefOrEqual(Mask[i], NumElts/2))
4773 return false;
4774 return true;
4775}
4776
4777/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4778/// specifies a shuffle of elements that is suitable for input to 128-bit
4779/// version of MOVDDUP.
4780static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4781 if (!VT.is128BitVector())
4782 return false;
4783
4784 unsigned e = VT.getVectorNumElements() / 2;
4785 for (unsigned i = 0; i != e; ++i)
4786 if (!isUndefOrEqual(Mask[i], i))
4787 return false;
4788 for (unsigned i = 0; i != e; ++i)
4789 if (!isUndefOrEqual(Mask[e+i], i))
4790 return false;
4791 return true;
4792}
4793
4794/// isVEXTRACTIndex - Return true if the specified
4795/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4796/// suitable for instructions that extract 128- or 256-bit vectors
4797static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4798 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4799 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4800 return false;
4801
4802 // The index should be aligned on a vecWidth-bit boundary.
4803 uint64_t Index =
4804 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4805
4806 MVT VT = N->getSimpleValueType(0);
4807 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4808 bool Result = (Index * ElSize) % vecWidth == 0;
4809
4810 return Result;
4811}
4812
4813/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4814/// operand specifies a subvector insert that is suitable for input to
4815/// insertion of 128 or 256-bit subvectors
4816static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4817 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4818 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4819 return false;
4820 // The index should be aligned on a vecWidth-bit boundary.
4821 uint64_t Index =
4822 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4823
4824 MVT VT = N->getSimpleValueType(0);
4825 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4826 bool Result = (Index * ElSize) % vecWidth == 0;
4827
4828 return Result;
4829}
4830
4831bool X86::isVINSERT128Index(SDNode *N) {
4832 return isVINSERTIndex(N, 128);
4833}
4834
4835bool X86::isVINSERT256Index(SDNode *N) {
4836 return isVINSERTIndex(N, 256);
4837}
4838
4839bool X86::isVEXTRACT128Index(SDNode *N) {
4840 return isVEXTRACTIndex(N, 128);
4841}
4842
4843bool X86::isVEXTRACT256Index(SDNode *N) {
4844 return isVEXTRACTIndex(N, 256);
4845}
4846
4847/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4848/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4849/// Handles 128-bit and 256-bit.
4850static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4851 MVT VT = N->getSimpleValueType(0);
4852
4853 assert((VT.getSizeInBits() >= 128) &&
4854 "Unsupported vector type for PSHUF/SHUFP");
4855
4856 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4857 // independently on 128-bit lanes.
4858 unsigned NumElts = VT.getVectorNumElements();
4859 unsigned NumLanes = VT.getSizeInBits()/128;
4860 unsigned NumLaneElts = NumElts/NumLanes;
4861
4862 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4863 "Only supports 2, 4 or 8 elements per lane");
4864
4865 unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4866 unsigned Mask = 0;
4867 for (unsigned i = 0; i != NumElts; ++i) {
4868 int Elt = N->getMaskElt(i);
4869 if (Elt < 0) continue;
4870 Elt &= NumLaneElts - 1;
4871 unsigned ShAmt = (i << Shift) % 8;
4872 Mask |= Elt << ShAmt;
4873 }
4874
4875 return Mask;
4876}
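
A worked example of the immediate packing above for a 128-bit, 4-element mask <3, 1, 2, 0>: two bits per element, element 0 in the lowest bits, giving the familiar shufps/pshufd immediate 0x27:

#include <cassert>

int main() {
  const int Mask[4] = {3, 1, 2, 0};
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (Mask[i] & 0x3) << (i * 2);   // 2 bits per element
  assert(Imm == 0x27);                   // binary 00 10 01 11
}
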
4877
4878/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4879/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4880static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4881 MVT VT = N->getSimpleValueType(0);
4882
4883 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4884 "Unsupported vector type for PSHUFHW");
4885
4886 unsigned NumElts = VT.getVectorNumElements();
4887
4888 unsigned Mask = 0;
4889 for (unsigned l = 0; l != NumElts; l += 8) {
4890 // 8 nodes per lane, but we only care about the last 4.
4891 for (unsigned i = 0; i < 4; ++i) {
4892 int Elt = N->getMaskElt(l+i+4);
4893 if (Elt < 0) continue;
4894 Elt &= 0x3; // only 2-bits.
4895 Mask |= Elt << (i * 2);
4896 }
4897 }
4898
4899 return Mask;
4900}
4901
4902/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4903/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4904static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4905 MVT VT = N->getSimpleValueType(0);
4906
4907 assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4908 "Unsupported vector type for PSHUFHW");
4909
4910 unsigned NumElts = VT.getVectorNumElements();
4911
4912 unsigned Mask = 0;
4913 for (unsigned l = 0; l != NumElts; l += 8) {
4914 // 8 nodes per lane, but we only care about the first 4.
4915 for (unsigned i = 0; i < 4; ++i) {
4916 int Elt = N->getMaskElt(l+i);
4917 if (Elt < 0) continue;
4918 Elt &= 0x3; // only 2-bits
4919 Mask |= Elt << (i * 2);
4920 }
4921 }
4922
4923 return Mask;
4924}
4925
4926/// \brief Return the appropriate immediate to shuffle the specified
4927/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4928/// VALIGN (if Interlane is true) instructions.
4929static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4930 bool InterLane) {
4931 MVT VT = SVOp->getSimpleValueType(0);
4932 unsigned EltSize = InterLane ? 1 :
4933 VT.getVectorElementType().getSizeInBits() >> 3;
4934
4935 unsigned NumElts = VT.getVectorNumElements();
4936 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4937 unsigned NumLaneElts = NumElts/NumLanes;
4938
4939 int Val = 0;
4940 unsigned i;
4941 for (i = 0; i != NumElts; ++i) {
4942 Val = SVOp->getMaskElt(i);
4943 if (Val >= 0)
4944 break;
4945 }
4946 if (Val >= (int)NumElts)
4947 Val -= NumElts - NumLaneElts;
4948
4949 assert(Val - i > 0 && "PALIGNR imm should be positive");
4950 return (Val - i) * EltSize;
4951}
4952
4953/// \brief Return the appropriate immediate to shuffle the specified
4954/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4955static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4956 return getShuffleAlignrImmediate(SVOp, false);
4957}
4958
4959/// \brief Return the appropriate immediate to shuffle the specified
4960/// VECTOR_SHUFFLE mask with the VALIGN instruction.
4961static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4962 return getShuffleAlignrImmediate(SVOp, true);
4963}
4964
4965
4966static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4967 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4968 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4969 llvm_unreachable("Illegal extract subvector for VEXTRACT");
4970
4971 uint64_t Index =
4972 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4973
4974 MVT VecVT = N->getOperand(0).getSimpleValueType();
4975 MVT ElVT = VecVT.getVectorElementType();
4976
4977 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4978 return Index / NumElemsPerChunk;
4979}
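
An arithmetic example of the chunk-index computation above: extracting elements [4..7] of a v8f32 as a 128-bit subvector yields a VEXTRACTF128 immediate of 1:

#include <cassert>

int main() {
  const unsigned Index = 4;                      // first element of the extracted subvector
  const unsigned EltBits = 32;                   // f32 elements
  const unsigned NumElemsPerChunk = 128 / EltBits;
  assert(Index / NumElemsPerChunk == 1);         // the upper 128-bit half
}
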
4980
4981static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4982 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4983 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4984 llvm_unreachable("Illegal insert subvector for VINSERT");
4985
4986 uint64_t Index =
4987 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4988
4989 MVT VecVT = N->getSimpleValueType(0);
4990 MVT ElVT = VecVT.getVectorElementType();
4991
4992 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4993 return Index / NumElemsPerChunk;
4994}
4995
4996/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4997/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4998/// and VINSERTI128 instructions.
4999unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
5000 return getExtractVEXTRACTImmediate(N, 128);
5001}
5002
5003/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
5004/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5005/// and VINSERTI64x4 instructions.
5006unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5007 return getExtractVEXTRACTImmediate(N, 256);
5008}
5009
5010/// getInsertVINSERT128Immediate - Return the appropriate immediate
5011/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5012/// and VINSERTI128 instructions.
5013unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5014 return getInsertVINSERTImmediate(N, 128);
5015}
5016
5017/// getInsertVINSERT256Immediate - Return the appropriate immediate
5018/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
5019/// and VINSERTI64x4 instructions.
5020unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5021 return getInsertVINSERTImmediate(N, 256);
5022}
5023
5024/// isZero - Returns true if V is a constant integer zero.
5025static bool isZero(SDValue V) {
5026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5027 return C && C->isNullValue();
5028}
5029
5030/// isZeroNode - Returns true if Elt is a constant zero or a floating point
5031/// constant +0.0.
5032bool X86::isZeroNode(SDValue Elt) {
5033 if (isZero(Elt))
5034 return true;
5035 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5036 return CFP->getValueAPF().isPosZero();
5037 return false;
5038}
5039
5040/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5041/// match movhlps. The lower half elements should come from upper half of
5042/// V1 (and in order), and the upper half elements should come from the upper
5043/// half of V2 (and in order).
5044static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5045 if (!VT.is128BitVector())
5046 return false;
5047 if (VT.getVectorNumElements() != 4)
5048 return false;
5049 for (unsigned i = 0, e = 2; i != e; ++i)
5050 if (!isUndefOrEqual(Mask[i], i+2))
5051 return false;
5052 for (unsigned i = 2; i != 4; ++i)
5053 if (!isUndefOrEqual(Mask[i], i+4))
5054 return false;
5055 return true;
5056}
5057
5058/// isScalarLoadToVector - Returns true if the node is a scalar load that
5059/// is promoted to a vector. It also returns the LoadSDNode by reference if
5060/// required.
5061static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5062 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5063 return false;
5064 N = N->getOperand(0).getNode();
5065 if (!ISD::isNON_EXTLoad(N))
5066 return false;
5067 if (LD)
5068 *LD = cast<LoadSDNode>(N);
5069 return true;
5070}
5071
5072// Test whether the given value is a vector value which will be legalized
5073// into a load.
5074static bool WillBeConstantPoolLoad(SDNode *N) {
5075 if (N->getOpcode() != ISD::BUILD_VECTOR)
5076 return false;
5077
5078 // Check for any non-constant elements.
5079 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5080 switch (N->getOperand(i).getNode()->getOpcode()) {
5081 case ISD::UNDEF:
5082 case ISD::ConstantFP:
5083 case ISD::Constant:
5084 break;
5085 default:
5086 return false;
5087 }
5088
5089 // Vectors of all-zeros and all-ones are materialized with special
5090 // instructions rather than being loaded.
5091 return !ISD::isBuildVectorAllZeros(N) &&
5092 !ISD::isBuildVectorAllOnes(N);
5093}
5094
5095/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5096/// match movlp{s|d}. The lower half elements should come from lower half of
5097/// V1 (and in order), and the upper half elements should come from the upper
5098/// half of V2 (and in order). And since V1 will become the source of the
5099/// MOVLP, it must be either a vector load or a scalar load to vector.
5100static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5101 ArrayRef<int> Mask, MVT VT) {
5102 if (!VT.is128BitVector())
5103 return false;
5104
5105 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5106 return false;
5107 // If V2 is a vector load, don't do this transformation. We will try to use
5108 // a load-folding shufps op instead.
5109 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5110 return false;
5111
5112 unsigned NumElems = VT.getVectorNumElements();
5113
5114 if (NumElems != 2 && NumElems != 4)
5115 return false;
5116 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5117 if (!isUndefOrEqual(Mask[i], i))
5118 return false;
5119 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5120 if (!isUndefOrEqual(Mask[i], i+NumElems))
5121 return false;
5122 return true;
5123}
5124
5125/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5126/// to a zero vector.
5127/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5128static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5129 SDValue V1 = N->getOperand(0);
5130 SDValue V2 = N->getOperand(1);
5131 unsigned NumElems = N->getValueType(0).getVectorNumElements();
5132 for (unsigned i = 0; i != NumElems; ++i) {
5133 int Idx = N->getMaskElt(i);
5134 if (Idx >= (int)NumElems) {
5135 unsigned Opc = V2.getOpcode();
5136 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5137 continue;
5138 if (Opc != ISD::BUILD_VECTOR ||
5139 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5140 return false;
5141 } else if (Idx >= 0) {
5142 unsigned Opc = V1.getOpcode();
5143 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5144 continue;
5145 if (Opc != ISD::BUILD_VECTOR ||
5146 !X86::isZeroNode(V1.getOperand(Idx)))
5147 return false;
5148 }
5149 }
5150 return true;
5151}
5152
5153/// getZeroVector - Returns a vector of specified type with all zero elements.
5154///
5155static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5156 SelectionDAG &DAG, SDLoc dl) {
5157 assert(VT.isVector() && "Expected a vector type");
5158
5159 // Always build SSE zero vectors as <4 x i32> bitcasted
5160 // to their dest type. This ensures they get CSE'd.
5161 SDValue Vec;
5162 if (VT.is128BitVector()) { // SSE
5163 if (Subtarget->hasSSE2()) { // SSE2
5164 SDValue Cst = DAG.getConstant(0, MVT::i32);
5165 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5166 } else { // SSE1
5167 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5168 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5169 }
5170 } else if (VT.is256BitVector()) { // AVX
5171 if (Subtarget->hasInt256()) { // AVX2
5172 SDValue Cst = DAG.getConstant(0, MVT::i32);
5173 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5175 } else {
5176 // 256-bit logic and arithmetic instructions in AVX are all
5177 // floating-point, no support for integer ops. Emit fp zeroed vectors.
5178 SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5179 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5180 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5181 }
5182 } else if (VT.is512BitVector()) { // AVX-512
5183 SDValue Cst = DAG.getConstant(0, MVT::i32);
5184 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5185 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5186 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5187 } else if (VT.getScalarType() == MVT::i1) {
5188 assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5189 SDValue Cst = DAG.getConstant(0, MVT::i1);
5190 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5191 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5192 } else
5193 llvm_unreachable("Unexpected vector type")::llvm::llvm_unreachable_internal("Unexpected vector type", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5193)
;
5194
5195 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5196}
5197
5198/// getOnesVector - Returns a vector of specified type with all bits set.
5199/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5200/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5201/// Then bitcast to their original type, ensuring they get CSE'd.
5202static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5203 SDLoc dl) {
5204 assert(VT.isVector() && "Expected a vector type");
5205
5206 SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5207 SDValue Vec;
5208 if (VT.is256BitVector()) {
5209 if (HasInt256) { // AVX2
5210 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5211 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5212 } else { // AVX
5213 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5214 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5215 }
5216 } else if (VT.is128BitVector()) {
5217 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5218 } else
5219 llvm_unreachable("Unexpected vector type")::llvm::llvm_unreachable_internal("Unexpected vector type", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5219)
;
5220
5221 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5222}
5223
5224/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5225/// that point to V2 point to its first element.
5226static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5227 for (unsigned i = 0; i != NumElems; ++i) {
5228 if (Mask[i] > (int)NumElems) {
5229 Mask[i] = NumElems;
5230 }
5231 }
5232}
5233
5234/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
5235/// operation of the specified width.
5236static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5237 SDValue V2) {
5238 unsigned NumElems = VT.getVectorNumElements();
5239 SmallVector<int, 8> Mask;
5240 Mask.push_back(NumElems);
5241 for (unsigned i = 1; i != NumElems; ++i)
5242 Mask.push_back(i);
5243 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5244}
5245
5246/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5247static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5248 SDValue V2) {
5249 unsigned NumElems = VT.getVectorNumElements();
5250 SmallVector<int, 8> Mask;
5251 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5252 Mask.push_back(i);
5253 Mask.push_back(i + NumElems);
5254 }
5255 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5256}
5257
5258/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5259static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5260 SDValue V2) {
5261 unsigned NumElems = VT.getVectorNumElements();
5262 SmallVector<int, 8> Mask;
5263 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5264 Mask.push_back(i + Half);
5265 Mask.push_back(i + NumElems + Half);
5266 }
5267 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5268}
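
As an aside (not part of the source): for an 8-element type, the two helpers above build the interleaving masks {0,8,1,9,2,10,3,11} (unpackl) and {4,12,5,13,6,14,7,15} (unpackh). A minimal standalone sketch of the same mask construction, using plain std::vector instead of LLVM's SmallVector:

#include <cstdio>
#include <vector>

// Build the unpack-low / unpack-high interleaving masks for NumElems lanes,
// mirroring the loops in getUnpackl/getUnpackh above (the second operand's
// lanes are addressed as NumElems..2*NumElems-1).
static std::vector<int> unpackMask(unsigned NumElems, bool High) {
  std::vector<int> Mask;
  unsigned Half = NumElems / 2;
  unsigned Base = High ? Half : 0;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(Base + i);            // lane from V1
    Mask.push_back(Base + i + NumElems); // matching lane from V2
  }
  return Mask;
}

int main() {
  for (int M : unpackMask(8, /*High=*/false)) std::printf("%d ", M); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  for (int M : unpackMask(8, /*High=*/true)) std::printf("%d ", M);  // 4 12 5 13 6 14 7 15
  std::printf("\n");
}
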
5269
5270// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
5271// a generic shuffle instruction because the target has no such instructions.
5272// Generate shuffles which repeat the i16 and i8 elements several times until
5273// they can be represented by v4f32, then manipulate them with target supported shuffles.
5274static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5275 MVT VT = V.getSimpleValueType();
5276 int NumElems = VT.getVectorNumElements();
5277 SDLoc dl(V);
5278
5279 while (NumElems > 4) {
5280 if (EltNo < NumElems/2) {
5281 V = getUnpackl(DAG, dl, VT, V, V);
5282 } else {
5283 V = getUnpackh(DAG, dl, VT, V, V);
5284 EltNo -= NumElems/2;
5285 }
5286 NumElems >>= 1;
5287 }
5288 return V;
5289}
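
For illustration only (not part of the source): the loop above repeatedly halves the effective element count while tracking where the splat element lands. For example, splatting element 11 of a v16i8 takes one unpackh (index becomes 3 of 8) and one unpackl (index stays 3 of 4). A small standalone trace of just the index arithmetic:

#include <cstdio>

// Trace how PromoteSplati8i16's loop rewrites (NumElems, EltNo) until the
// splat index fits in a 4-element (v4f32-sized) view. This is only the index
// bookkeeping; the real code also emits the unpackl/unpackh shuffles.
int main() {
  int NumElems = 16, EltNo = 11; // e.g. splat of lane 11 in a v16i8
  while (NumElems > 4) {
    if (EltNo < NumElems / 2) {
      std::printf("unpackl: ");
    } else {
      std::printf("unpackh: ");
      EltNo -= NumElems / 2;
    }
    NumElems >>= 1;
    std::printf("NumElems=%d EltNo=%d\n", NumElems, EltNo);
  }
  // Prints: unpackh: NumElems=8 EltNo=3
  //         unpackl: NumElems=4 EltNo=3
}
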
5290
5291/// getLegalSplat - Generate a legal splat with supported x86 shuffles
5292static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5293 MVT VT = V.getSimpleValueType();
5294 SDLoc dl(V);
5295
5296 if (VT.is128BitVector()) {
5297 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5298 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5299 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5300 &SplatMask[0]);
5301 } else if (VT.is256BitVector()) {
5302 // To use VPERMILPS to splat scalars, the second half of indices must
5303 // refer to the higher part, which is a duplication of the lower one,
5304 // because VPERMILPS can only handle in-lane permutations.
5305 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5306 EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5307
5308 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5309 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5310 &SplatMask[0]);
5311 } else
5312 llvm_unreachable("Vector size not supported")::llvm::llvm_unreachable_internal("Vector size not supported"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 5312)
;
5313
5314 return DAG.getNode(ISD::BITCAST, dl, VT, V);
5315}
5316
5317/// PromoteSplat - Splat is promoted to target supported vector shuffles.
5318static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5319 MVT SrcVT = SV->getSimpleValueType(0);
5320 SDValue V1 = SV->getOperand(0);
5321 SDLoc dl(SV);
5322
5323 int EltNo = SV->getSplatIndex();
5324 int NumElems = SrcVT.getVectorNumElements();
5325 bool Is256BitVec = SrcVT.is256BitVector();
5326
5327 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5328 "Unknown how to promote splat for type");
5329
5330 // Extract the 128-bit part containing the splat element and update
5331 // the splat element index when it refers to the higher register.
5332 if (Is256BitVec) {
5333 V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5334 if (EltNo >= NumElems/2)
5335 EltNo -= NumElems/2;
5336 }
5337
5338 // i16 and i8 vector types can't be used directly by a generic shuffle
5339 // instruction because the target has no such instruction. Generate shuffles
5340 // which repeat i16 and i8 several times until they fit in i32, and then can
5341 // be manipulated by target supported shuffles.
5342 MVT EltVT = SrcVT.getVectorElementType();
5343 if (EltVT == MVT::i8 || EltVT == MVT::i16)
5344 V1 = PromoteSplati8i16(V1, DAG, EltNo);
5345
5346 // Recreate the 256-bit vector and place the same 128-bit vector
5347 // into the low and high part. This is necessary because we want
5348 // to use VPERM* to shuffle the vectors
5349 if (Is256BitVec) {
5350 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5351 }
5352
5353 return getLegalSplat(DAG, V1, EltNo);
5354}
5355
5356/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5357/// vector and a zero or undef vector. This produces a shuffle where the low
5358/// element of V2 is swizzled into the zero/undef vector, landing at element
5359/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5360static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5361 bool IsZero,
5362 const X86Subtarget *Subtarget,
5363 SelectionDAG &DAG) {
5364 MVT VT = V2.getSimpleValueType();
5365 SDValue V1 = IsZero
5366 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5367 unsigned NumElems = VT.getVectorNumElements();
5368 SmallVector<int, 16> MaskVec;
5369 for (unsigned i = 0; i != NumElems; ++i)
5370 // If this is the insertion idx, put the low elt of V2 here.
5371 MaskVec.push_back(i == Idx ? NumElems : i);
5372 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5373}
5374
5375/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5376/// target specific opcode. Returns true if the Mask could be calculated. Sets
5377/// IsUnary to true if the node only uses one source. Note that this will set IsUnary for
5378/// shuffles which use a single input multiple times, and in those cases it will
5379/// adjust the mask to only have indices within that single input.
5380static bool getTargetShuffleMask(SDNode *N, MVT VT,
5381 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5382 unsigned NumElems = VT.getVectorNumElements();
5383 SDValue ImmN;
5384
5385 IsUnary = false;
5386 bool IsFakeUnary = false;
5387 switch(N->getOpcode()) {
5388 case X86ISD::BLENDI:
5389 ImmN = N->getOperand(N->getNumOperands()-1);
5390 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5391 break;
5392 case X86ISD::SHUFP:
5393 ImmN = N->getOperand(N->getNumOperands()-1);
5394 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5395 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5396 break;
5397 case X86ISD::UNPCKH:
5398 DecodeUNPCKHMask(VT, Mask);
5399 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5400 break;
5401 case X86ISD::UNPCKL:
5402 DecodeUNPCKLMask(VT, Mask);
5403 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5404 break;
5405 case X86ISD::MOVHLPS:
5406 DecodeMOVHLPSMask(NumElems, Mask);
5407 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5408 break;
5409 case X86ISD::MOVLHPS:
5410 DecodeMOVLHPSMask(NumElems, Mask);
5411 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5412 break;
5413 case X86ISD::PALIGNR:
5414 ImmN = N->getOperand(N->getNumOperands()-1);
5415 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5416 break;
5417 case X86ISD::PSHUFD:
5418 case X86ISD::VPERMILPI:
5419 ImmN = N->getOperand(N->getNumOperands()-1);
5420 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5421 IsUnary = true;
5422 break;
5423 case X86ISD::PSHUFHW:
5424 ImmN = N->getOperand(N->getNumOperands()-1);
5425 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5426 IsUnary = true;
5427 break;
5428 case X86ISD::PSHUFLW:
5429 ImmN = N->getOperand(N->getNumOperands()-1);
5430 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5431 IsUnary = true;
5432 break;
5433 case X86ISD::PSHUFB: {
5434 IsUnary = true;
5435 SDValue MaskNode = N->getOperand(1);
5436 while (MaskNode->getOpcode() == ISD::BITCAST)
5437 MaskNode = MaskNode->getOperand(0);
5438
5439 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5440 // If we have a build-vector, then things are easy.
5441 EVT VT = MaskNode.getValueType();
5442 assert(VT.isVector() &&
5443 "Can't produce a non-vector with a build_vector!");
5444 if (!VT.isInteger())
5445 return false;
5446
5447 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5448
5449 SmallVector<uint64_t, 32> RawMask;
5450 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5451 SDValue Op = MaskNode->getOperand(i);
5452 if (Op->getOpcode() == ISD::UNDEF) {
5453 RawMask.push_back((uint64_t)SM_SentinelUndef);
5454 continue;
5455 }
5456 auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5457 if (!CN)
5458 return false;
5459 APInt MaskElement = CN->getAPIntValue();
5460
5461 // We now have to decode the element which could be any integer size and
5462 // extract each byte of it.
5463 for (int j = 0; j < NumBytesPerElement; ++j) {
5464 // Note that this is x86 and so always little endian: the low byte is
5465 // the first byte of the mask.
5466 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5467 MaskElement = MaskElement.lshr(8);
5468 }
5469 }
5470 DecodePSHUFBMask(RawMask, Mask);
5471 break;
5472 }
5473
5474 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5475 if (!MaskLoad)
5476 return false;
5477
5478 SDValue Ptr = MaskLoad->getBasePtr();
5479 if (Ptr->getOpcode() == X86ISD::Wrapper)
5480 Ptr = Ptr->getOperand(0);
5481
5482 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5483 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5484 return false;
5485
5486 if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5487 DecodePSHUFBMask(C, Mask);
5488 break;
5489 }
5490
5491 return false;
5492 }
5493 case X86ISD::VPERMI:
5494 ImmN = N->getOperand(N->getNumOperands()-1);
5495 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5496 IsUnary = true;
5497 break;
5498 case X86ISD::MOVSS:
5499 case X86ISD::MOVSD:
5500 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5501 break;
5502 case X86ISD::VPERM2X128:
5503 ImmN = N->getOperand(N->getNumOperands()-1);
5504 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5505 if (Mask.empty()) return false;
5506 break;
5507 case X86ISD::MOVSLDUP:
5508 DecodeMOVSLDUPMask(VT, Mask);
5509 IsUnary = true;
5510 break;
5511 case X86ISD::MOVSHDUP:
5512 DecodeMOVSHDUPMask(VT, Mask);
5513 IsUnary = true;
5514 break;
5515 case X86ISD::MOVDDUP:
5516 DecodeMOVDDUPMask(VT, Mask);
5517 IsUnary = true;
5518 break;
5519 case X86ISD::MOVLHPD:
5520 case X86ISD::MOVLPD:
5521 case X86ISD::MOVLPS:
5522 // Not yet implemented
5523 return false;
5524 default: llvm_unreachable("unknown target shuffle node");
5525 }
5526
5527 // If we have a fake unary shuffle, the shuffle mask is spread across two
5528 // inputs that are actually the same node. Re-map the mask to always point
5529 // into the first input.
5530 if (IsFakeUnary)
5531 for (int &M : Mask)
5532 if (M >= (int)Mask.size())
5533 M -= Mask.size();
5534
5535 return true;
5536}
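
As a standalone illustration of one of the decode steps used above (hedged: this mirrors the PSHUFD/VPERMILPI immediate format for a single 128-bit lane of 4 elements; the real DecodePSHUFMask also handles wider types lane by lane): each 2-bit field of the immediate selects the source element for one result lane.

#include <cstdio>

// Decode a 4-lane PSHUFD-style immediate: result lane i takes source lane
// (Imm >> (2*i)) & 3. Example immediate 0x1B = 0b00011011 reverses the lanes.
static void decodePSHUF4(unsigned Imm, int Mask[4]) {
  for (int i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
}

int main() {
  int Mask[4];
  decodePSHUF4(0x1B, Mask);
  std::printf("%d %d %d %d\n", Mask[0], Mask[1], Mask[2], Mask[3]); // 3 2 1 0
}
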
5537
5538/// getShuffleScalarElt - Returns the scalar element that will make up the ith
5539/// element of the result of the vector shuffle.
5540static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5541 unsigned Depth) {
5542 if (Depth == 6)
5543 return SDValue(); // Limit search depth.
5544
5545 SDValue V = SDValue(N, 0);
5546 EVT VT = V.getValueType();
5547 unsigned Opcode = V.getOpcode();
5548
5549 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5550 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5551 int Elt = SV->getMaskElt(Index);
5552
5553 if (Elt < 0)
5554 return DAG.getUNDEF(VT.getVectorElementType());
5555
5556 unsigned NumElems = VT.getVectorNumElements();
5557 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5558 : SV->getOperand(1);
5559 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5560 }
5561
5562 // Recurse into target specific vector shuffles to find scalars.
5563 if (isTargetShuffle(Opcode)) {
5564 MVT ShufVT = V.getSimpleValueType();
5565 unsigned NumElems = ShufVT.getVectorNumElements();
5566 SmallVector<int, 16> ShuffleMask;
5567 bool IsUnary;
5568
5569 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5570 return SDValue();
5571
5572 int Elt = ShuffleMask[Index];
5573 if (Elt < 0)
5574 return DAG.getUNDEF(ShufVT.getVectorElementType());
5575
5576 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5577 : N->getOperand(1);
5578 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5579 Depth+1);
5580 }
5581
5582 // Actual nodes that may contain scalar elements
5583 if (Opcode == ISD::BITCAST) {
5584 V = V.getOperand(0);
5585 EVT SrcVT = V.getValueType();
5586 unsigned NumElems = VT.getVectorNumElements();
5587
5588 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5589 return SDValue();
5590 }
5591
5592 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5593 return (Index == 0) ? V.getOperand(0)
5594 : DAG.getUNDEF(VT.getVectorElementType());
5595
5596 if (V.getOpcode() == ISD::BUILD_VECTOR)
5597 return V.getOperand(Index);
5598
5599 return SDValue();
5600}
5601
5602/// getNumOfConsecutiveZeros - Return the number of consecutive zero elements
5603/// in the result of a vector shuffle operation. The
5604/// search can start in two different directions, from left or right.
5605/// We count undefs as zeros until PreferredNum is reached.
5606static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5607 unsigned NumElems, bool ZerosFromLeft,
5608 SelectionDAG &DAG,
5609 unsigned PreferredNum = -1U) {
5610 unsigned NumZeros = 0;
5611 for (unsigned i = 0; i != NumElems; ++i) {
5612 unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5613 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5614 if (!Elt.getNode())
5615 break;
5616
5617 if (X86::isZeroNode(Elt))
5618 ++NumZeros;
5619 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5620 NumZeros = std::min(NumZeros + 1, PreferredNum);
5621 else
5622 break;
5623 }
5624
5625 return NumZeros;
5626}
5627
5628/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5629/// correspond consecutively to elements from one of the vector operands,
5630/// starting from its index OpIdx. Also tell OpNum which source vector operand.
5631static
5632bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5633 unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5634 unsigned NumElems, unsigned &OpNum) {
5635 bool SeenV1 = false;
5636 bool SeenV2 = false;
5637
5638 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5639 int Idx = SVOp->getMaskElt(i);
5640 // Ignore undef indices
5641 if (Idx < 0)
5642 continue;
5643
5644 if (Idx < (int)NumElems)
5645 SeenV1 = true;
5646 else
5647 SeenV2 = true;
5648
5649 // Only accept consecutive elements from the same vector
5650 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5651 return false;
5652 }
5653
5654 OpNum = SeenV1 ? 0 : 1;
5655 return true;
5656}
5657
5658/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5659/// logical right shift of a vector.
5660static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5661 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5662 unsigned NumElems =
5663 SVOp->getSimpleValueType(0).getVectorNumElements();
5664 unsigned NumZeros = getNumOfConsecutiveZeros(
5665 SVOp, NumElems, false /* check zeros from right */, DAG,
5666 SVOp->getMaskElt(0));
5667 unsigned OpSrc;
5668
5669 if (!NumZeros)
5670 return false;
5671
5672 // Considering the elements in the mask that are not consecutive zeros,
5673 // check if they consecutively come from only one of the source vectors.
5674 //
5675 // V1 = {X, A, B, C} 0
5676 // \ \ \ /
5677 // vector_shuffle V1, V2 <1, 2, 3, X>
5678 //
5679 if (!isShuffleMaskConsecutive(SVOp,
5680 0, // Mask Start Index
5681 NumElems-NumZeros, // Mask End Index(exclusive)
5682 NumZeros, // Where to start looking in the src vector
5683 NumElems, // Number of elements in vector
5684 OpSrc)) // Which source operand ?
5685 return false;
5686
5687 isLeft = false;
5688 ShAmt = NumZeros;
5689 ShVal = SVOp->getOperand(OpSrc);
5690 return true;
5691}
5692
5693/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5694/// logical left shift of a vector.
5695static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5696 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5697 unsigned NumElems =
5698 SVOp->getSimpleValueType(0).getVectorNumElements();
5699 unsigned NumZeros = getNumOfConsecutiveZeros(
5700 SVOp, NumElems, true /* check zeros from left */, DAG,
5701 NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5702 unsigned OpSrc;
5703
5704 if (!NumZeros)
5705 return false;
5706
5707 // Considering the elements in the mask that are not consecutive zeros,
5708 // check if they consecutively come from only one of the source vectors.
5709 //
5710 // 0 { A, B, X, X } = V2
5711 // / \ / /
5712 // vector_shuffle V1, V2 <X, X, 4, 5>
5713 //
5714 if (!isShuffleMaskConsecutive(SVOp,
5715 NumZeros, // Mask Start Index
5716 NumElems, // Mask End Index(exclusive)
5717 0, // Where to start looking in the src vector
5718 NumElems, // Number of elements in vector
5719 OpSrc)) // Which source operand ?
5720 return false;
5721
5722 isLeft = true;
5723 ShAmt = NumZeros;
5724 ShVal = SVOp->getOperand(OpSrc);
5725 return true;
5726}
5727
5728/// isVectorShift - Returns true if the shuffle can be implemented as a
5729/// logical left or right shift of a vector.
5730static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5731 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5732 // Although the logic below supports any bitwidth size, there are no
5733 // shift instructions which handle more than 128-bit vectors.
5734 if (!SVOp->getSimpleValueType(0).is128BitVector())
5735 return false;
5736
5737 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5738 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5739 return true;
5740
5741 return false;
5742}
5743
5744/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5745///
5746static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5747 unsigned NumNonZero, unsigned NumZero,
5748 SelectionDAG &DAG,
5749 const X86Subtarget* Subtarget,
5750 const TargetLowering &TLI) {
5751 if (NumNonZero > 8)
5752 return SDValue();
5753
5754 SDLoc dl(Op);
5755 SDValue V;
5756 bool First = true;
5757 for (unsigned i = 0; i < 16; ++i) {
5758 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5759 if (ThisIsNonZero && First) {
5760 if (NumZero)
5761 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5762 else
5763 V = DAG.getUNDEF(MVT::v8i16);
5764 First = false;
5765 }
5766
5767 if ((i & 1) != 0) {
5768 SDValue ThisElt, LastElt;
5769 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5770 if (LastIsNonZero) {
5771 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5772 MVT::i16, Op.getOperand(i-1));
5773 }
5774 if (ThisIsNonZero) {
5775 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5776 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5777 ThisElt, DAG.getConstant(8, MVT::i8));
5778 if (LastIsNonZero)
5779 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5780 } else
5781 ThisElt = LastElt;
5782
5783 if (ThisElt.getNode())
5784 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5785 DAG.getIntPtrConstant(i/2));
5786 }
5787 }
5788
5789 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5790}
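
A hedged standalone sketch of the packing trick used above: adjacent byte elements are zero-extended and combined into one 16-bit lane as (hi << 8) | lo, which on little-endian x86 reproduces the original byte order when the v8i16 is bitcast back to v16i8. The helper name below is illustrative only, and the sketch ignores the zero/undef special cases the lowering handles:

#include <cstdint>
#include <cstdio>

// Pack 16 byte elements into 8 sixteen-bit lanes the way the lowering above
// does: lane i/2 = (byte[i] << 8) | byte[i-1] for each odd i.
static void packBytePairs(const uint8_t Bytes[16], uint16_t Lanes[8]) {
  for (int i = 1; i < 16; i += 2)
    Lanes[i / 2] = (uint16_t)((Bytes[i] << 8) | Bytes[i - 1]);
}

int main() {
  uint8_t Bytes[16];
  for (int i = 0; i != 16; ++i) Bytes[i] = (uint8_t)i;
  uint16_t Lanes[8];
  packBytePairs(Bytes, Lanes);
  std::printf("0x%04x 0x%04x\n", Lanes[0], Lanes[1]); // 0x0100 0x0302
}
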
5791
5792/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5793///
5794static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5795 unsigned NumNonZero, unsigned NumZero,
5796 SelectionDAG &DAG,
5797 const X86Subtarget* Subtarget,
5798 const TargetLowering &TLI) {
5799 if (NumNonZero > 4)
5800 return SDValue();
5801
5802 SDLoc dl(Op);
5803 SDValue V;
5804 bool First = true;
5805 for (unsigned i = 0; i < 8; ++i) {
5806 bool isNonZero = (NonZeros & (1 << i)) != 0;
5807 if (isNonZero) {
5808 if (First) {
5809 if (NumZero)
5810 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5811 else
5812 V = DAG.getUNDEF(MVT::v8i16);
5813 First = false;
5814 }
5815 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5816 MVT::v8i16, V, Op.getOperand(i),
5817 DAG.getIntPtrConstant(i));
5818 }
5819 }
5820
5821 return V;
5822}
5823
5824/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5825static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5826 const X86Subtarget *Subtarget,
5827 const TargetLowering &TLI) {
5828 // Find all zeroable elements.
5829 bool Zeroable[4];
5830 for (int i=0; i < 4; ++i) {
5831 SDValue Elt = Op->getOperand(i);
5832 Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5833 }
5834 assert(std::count_if(&Zeroable[0], &Zeroable[4],
5835 [](bool M) { return !M; }) > 1 &&
5836 "We expect at least two non-zero elements!");
5837
5838 // We only know how to deal with build_vector nodes where elements are either
5839 // zeroable or extract_vector_elt with constant index.
5840 SDValue FirstNonZero;
5841 unsigned FirstNonZeroIdx;
5842 for (unsigned i=0; i < 4; ++i) {
5843 if (Zeroable[i])
5844 continue;
5845 SDValue Elt = Op->getOperand(i);
5846 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5847 !isa<ConstantSDNode>(Elt.getOperand(1)))
5848 return SDValue();
5849 // Make sure that this node is extracting from a 128-bit vector.
5850 MVT VT = Elt.getOperand(0).getSimpleValueType();
5851 if (!VT.is128BitVector())
5852 return SDValue();
5853 if (!FirstNonZero.getNode()) {
5854 FirstNonZero = Elt;
5855 FirstNonZeroIdx = i;
5856 }
5857 }
5858
5859 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5860 SDValue V1 = FirstNonZero.getOperand(0);
5861 MVT VT = V1.getSimpleValueType();
5862
5863 // See if this build_vector can be lowered as a blend with zero.
5864 SDValue Elt;
5865 unsigned EltMaskIdx, EltIdx;
5866 int Mask[4];
5867 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5868 if (Zeroable[EltIdx]) {
5869 // The zero vector will be on the right hand side.
5870 Mask[EltIdx] = EltIdx+4;
5871 continue;
5872 }
5873
5874 Elt = Op->getOperand(EltIdx);
5875 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5876 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5877 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5878 break;
5879 Mask[EltIdx] = EltIdx;
5880 }
5881
5882 if (EltIdx == 4) {
5883 // Let the shuffle legalizer deal with blend operations.
5884 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5885 if (V1.getSimpleValueType() != VT)
5886 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5887 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5888 }
5889
5890 // See if we can lower this build_vector to a INSERTPS.
5891 if (!Subtarget->hasSSE41())
5892 return SDValue();
5893
5894 SDValue V2 = Elt.getOperand(0);
5895 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5896 V1 = SDValue();
5897
5898 bool CanFold = true;
5899 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5900 if (Zeroable[i])
5901 continue;
5902
5903 SDValue Current = Op->getOperand(i);
5904 SDValue SrcVector = Current->getOperand(0);
5905 if (!V1.getNode())
5906 V1 = SrcVector;
5907 CanFold = SrcVector == V1 &&
5908 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5909 }
5910
5911 if (!CanFold)
5912 return SDValue();
5913
5914 assert(V1.getNode() && "Expected at least two non-zero elements!");
5915 if (V1.getSimpleValueType() != MVT::v4f32)
5916 V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5917 if (V2.getSimpleValueType() != MVT::v4f32)
5918 V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5919
5920 // Ok, we can emit an INSERTPS instruction.
5921 unsigned ZMask = 0;
5922 for (int i = 0; i < 4; ++i)
5923 if (Zeroable[i])
5924 ZMask |= 1 << i;
5925
5926 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5927 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5928 SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5929 DAG.getIntPtrConstant(InsertPSMask));
5930 return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5931}
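
A short standalone note on the INSERTPS immediate built above (this matches the instruction's documented encoding; the sketch just reconstructs the byte): bits 7:6 select the source element of the second operand, bits 5:4 the destination lane, and bits 3:0 are a zero mask.

#include <cstdio>

// Build an INSERTPS immediate: take source element SrcIdx, write it into
// destination lane DstIdx, and zero every lane whose bit is set in ZMask.
static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}

int main() {
  // Insert source element 2 into lane 1 and zero lane 3: 0b10'01'1000 = 0x98.
  std::printf("0x%02x\n", insertPSImm(2, 1, 0x8));
}
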
5932
5933/// Return a vector logical shift node.
5934static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5935 unsigned NumBits, SelectionDAG &DAG,
5936 const TargetLowering &TLI, SDLoc dl) {
5937 assert(VT.is128BitVector() && "Unknown type for VShift");
5938 MVT ShVT = MVT::v2i64;
5939 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5940 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5941 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5942 SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5943 return DAG.getNode(ISD::BITCAST, dl, VT,
5944 DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5945}
5946
5947static SDValue
5948LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5949
5950 // Check if the scalar load can be widened into a vector load. And if
5951 // the address is "base + cst" see if the cst can be "absorbed" into
5952 // the shuffle mask.
5953 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5954 SDValue Ptr = LD->getBasePtr();
5955 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5956 return SDValue();
5957 EVT PVT = LD->getValueType(0);
5958 if (PVT != MVT::i32 && PVT != MVT::f32)
5959 return SDValue();
5960
5961 int FI = -1;
5962 int64_t Offset = 0;
5963 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5964 FI = FINode->getIndex();
5965 Offset = 0;
5966 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5967 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5968 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5969 Offset = Ptr.getConstantOperandVal(1);
5970 Ptr = Ptr.getOperand(0);
5971 } else {
5972 return SDValue();
5973 }
5974
5975 // FIXME: 256-bit vector instructions don't require a strict alignment,
5976 // improve this code to support it better.
5977 unsigned RequiredAlign = VT.getSizeInBits()/8;
5978 SDValue Chain = LD->getChain();
5979 // Make sure the stack object alignment is at least 16 or 32.
5980 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5981 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5982 if (MFI->isFixedObjectIndex(FI)) {
5983 // Can't change the alignment. FIXME: It's possible to compute
5984 // the exact stack offset and reference FI + adjust offset instead.
5985 // If someone *really* cares about this. That's the way to implement it.
5986 return SDValue();
5987 } else {
5988 MFI->setObjectAlignment(FI, RequiredAlign);
5989 }
5990 }
5991
5992 // (Offset % 16 or 32) must be a multiple of 4. The address is then
5993 // Ptr + (Offset & ~15).
5994 if (Offset < 0)
5995 return SDValue();
5996 if ((Offset % RequiredAlign) & 3)
5997 return SDValue();
5998 int64_t StartOffset = Offset & ~(RequiredAlign-1);
5999 if (StartOffset)
6000 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
6001 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
6002
6003 int EltNo = (Offset - StartOffset) >> 2;
6004 unsigned NumElems = VT.getVectorNumElements();
6005
6006 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6007 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6008 LD->getPointerInfo().getWithOffset(StartOffset),
6009 false, false, false, 0);
6010
6011 SmallVector<int, 8> Mask;
6012 for (unsigned i = 0; i != NumElems; ++i)
6013 Mask.push_back(EltNo);
6014
6015 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6016 }
6017
6018 return SDValue();
6019}
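
For clarity (illustrative only, not part of the source): the offset-absorption arithmetic above rounds the load address down to the vector alignment and turns the remainder into a splat element index. For a 16-byte vector of 32-bit elements and a scalar at offset 20, the widened load starts at offset 16 and the splat uses element 1:

#include <cstdint>
#include <cstdio>

int main() {
  // Mirrors the arithmetic in LowerAsSplatVectorLoad for a 128-bit vector.
  unsigned RequiredAlign = 16;                      // VT.getSizeInBits() / 8
  int64_t Offset = 20;                              // scalar lives at base + 20
  if ((Offset % RequiredAlign) & 3) return 1;       // must be 4-byte aligned
  int64_t StartOffset = Offset & ~(int64_t)(RequiredAlign - 1);
  int EltNo = (int)((Offset - StartOffset) >> 2);   // 32-bit elements
  std::printf("StartOffset=%lld EltNo=%d\n", (long long)StartOffset, EltNo);
  // Prints: StartOffset=16 EltNo=1
}
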
6020
6021/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6022/// vector of type 'VT', see if the elements can be replaced by a single large
6023/// load which has the same value as a build_vector whose operands are 'elts'.
6024///
6025/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6026///
6027/// FIXME: we'd also like to handle the case where the last elements are zero
6028/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6029/// There's even a handy isZeroNode for that purpose.
6030static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6031 SDLoc &DL, SelectionDAG &DAG,
6032 bool isAfterLegalize) {
6033 EVT EltVT = VT.getVectorElementType();
6034 unsigned NumElems = Elts.size();
6035
6036 LoadSDNode *LDBase = nullptr;
6
'LDBase' initialized to a null pointer value
6037 unsigned LastLoadedElt = -1U;
6038
6039 // For each element in the initializer, see if we've found a load or an undef.
6040 // If we don't find an initial load element, or later load elements are
6041 // non-consecutive, bail out.
6042 for (unsigned i = 0; i < NumElems; ++i) {
7
Assuming 'i' is >= 'NumElems'
8
Loop condition is false. Execution continues on line 6067
6043 SDValue Elt = Elts[i];
6044
6045 if (!Elt.getNode() ||
6046 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6047 return SDValue();
6048 if (!LDBase) {
6049 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6050 return SDValue();
6051 LDBase = cast<LoadSDNode>(Elt.getNode());
6052 LastLoadedElt = i;
6053 continue;
6054 }
6055 if (Elt.getOpcode() == ISD::UNDEF)
6056 continue;
6057
6058 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6059 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6060 return SDValue();
6061 LastLoadedElt = i;
6062 }
6063
6064 // If we have found an entire vector of loads and undefs, then return a large
6065 // load of the entire vector width starting at the base pointer. If we found
6066 // consecutive loads for the low half, generate a vzext_load node.
6067 if (LastLoadedElt == NumElems - 1) {
9
Taking true branch
6068
6069 if (isAfterLegalize &&
10
Taking false branch
6070 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6071 return SDValue();
6072
6073 SDValue NewLd = SDValue();
6074
6075 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
11
Called C++ object pointer is null
6076 LDBase->getPointerInfo(), LDBase->isVolatile(),
6077 LDBase->isNonTemporal(), LDBase->isInvariant(),
6078 LDBase->getAlignment());
6079
6080 if (LDBase->hasAnyUseOfValue(1)) {
6081 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6082 SDValue(LDBase, 1),
6083 SDValue(NewLd.getNode(), 1));
6084 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6085 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6086 SDValue(NewLd.getNode(), 1));
6087 }
6088
6089 return NewLd;
6090 }
6091
6092 // TODO: The code below fires only for loading the low v2i32 / v2f32
6093 // of a v4i32 / v4f32. It's probably worth generalizing.
6094 if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6095 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6096 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6097 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6098 SDValue ResNode =
6099 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6100 LDBase->getPointerInfo(),
6101 LDBase->getAlignment(),
6102 false/*isVolatile*/, true/*ReadMem*/,
6103 false/*WriteMem*/);
6104
6105 // Make sure the newly-created LOAD is in the same position as LDBase in
6106 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6107 // update uses of LDBase's output chain to use the TokenFactor.
6108 if (LDBase->hasAnyUseOfValue(1)) {
6109 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6110 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6111 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6112 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6113 SDValue(ResNode.getNode(), 1));
6114 }
6115
6116 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6117 }
6118 return SDValue();
6119}
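
Hedged standalone sketch: the "consecutive" requirement checked through DAG.isConsecutiveLoad above amounts to element i being loaded from BaseAddr + i * ElementSize. The helper below only models that offset check on raw offsets; it is not the analyzer's reported path and is not LLVM API:

#include <cstdint>
#include <cstdio>

// Returns true if every known offset equals Base + Index * EltSizeBytes,
// the property EltsFromConsecutiveLoads needs before it merges the element
// loads into one wide load. Offsets < 0 stand in for undef elements.
static bool offsetsAreConsecutive(const int64_t *Offsets, unsigned NumElems,
                                  int64_t Base, unsigned EltSizeBytes) {
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Offsets[i] < 0)
      continue; // undef element, no constraint
    if (Offsets[i] != Base + (int64_t)i * EltSizeBytes)
      return false;
  }
  return true;
}

int main() {
  int64_t Offsets[4] = {0, 4, -1, 12}; // i32 elements, one undef
  std::printf("%d\n", offsetsAreConsecutive(Offsets, 4, 0, 4)); // 1
}
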
6120
6121/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6122/// to generate a splat value for the following cases:
6123/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6124/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6125/// a scalar load, or a constant.
6126/// The VBROADCAST node is returned when a pattern is found,
6127/// or SDValue() otherwise.
6128static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6129 SelectionDAG &DAG) {
6130 // VBROADCAST requires AVX.
6131 // TODO: Splats could be generated for non-AVX CPUs using SSE
6132 // instructions, but there's less potential gain for only 128-bit vectors.
6133 if (!Subtarget->hasAVX())
6134 return SDValue();
6135
6136 MVT VT = Op.getSimpleValueType();
6137 SDLoc dl(Op);
6138
6139 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6140 "Unsupported vector type for broadcast.");
6141
6142 SDValue Ld;
6143 bool ConstSplatVal;
6144
6145 switch (Op.getOpcode()) {
6146 default:
6147 // Unknown pattern found.
6148 return SDValue();
6149
6150 case ISD::BUILD_VECTOR: {
6151 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6152 BitVector UndefElements;
6153 SDValue Splat = BVOp->getSplatValue(&UndefElements);
6154
6155 // We need a splat of a single value to use broadcast, and it doesn't
6156 // make any sense if the value is only in one element of the vector.
6157 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6158 return SDValue();
6159
6160 Ld = Splat;
6161 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6162 Ld.getOpcode() == ISD::ConstantFP);
6163
6164 // Make sure that all of the users of a non-constant load are from the
6165 // BUILD_VECTOR node.
6166 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6167 return SDValue();
6168 break;
6169 }
6170
6171 case ISD::VECTOR_SHUFFLE: {
6172 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6173
6174 // Shuffles must have a splat mask where the first element is
6175 // broadcasted.
6176 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6177 return SDValue();
6178
6179 SDValue Sc = Op.getOperand(0);
6180 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6181 Sc.getOpcode() != ISD::BUILD_VECTOR) {
6182
6183 if (!Subtarget->hasInt256())
6184 return SDValue();
6185
6186 // Use the register form of the broadcast instruction available on AVX2.
6187 if (VT.getSizeInBits() >= 256)
6188 Sc = Extract128BitVector(Sc, 0, DAG, dl);
6189 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6190 }
6191
6192 Ld = Sc.getOperand(0);
6193 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6194 Ld.getOpcode() == ISD::ConstantFP);
6195
6196 // The scalar_to_vector node and the suspected
6197 // load node must have exactly one user.
6198 // Constants may have multiple users.
6199
6200 // AVX-512 has a register version of the broadcast.
6201 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6202 Ld.getValueType().getSizeInBits() >= 32;
6203 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6204 !hasRegVer))
6205 return SDValue();
6206 break;
6207 }
6208 }
6209
6210 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6211 bool IsGE256 = (VT.getSizeInBits() >= 256);
6212
6213 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6214 // instruction to save 8 or more bytes of constant pool data.
6215 // TODO: If multiple splats are generated to load the same constant,
6216 // it may be detrimental to overall size. There needs to be a way to detect
6217 // that condition to know if this is truly a size win.
6218 const Function *F = DAG.getMachineFunction().getFunction();
6219 bool OptForSize = F->getAttributes().
6220 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6221
6222 // Handle broadcasting a single constant scalar from the constant pool
6223 // into a vector.
6224 // On Sandybridge (no AVX2), it is still better to load a constant vector
6225 // from the constant pool and not to broadcast it from a scalar.
6226 // But override that restriction when optimizing for size.
6227 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6228 if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6229 EVT CVT = Ld.getValueType();
6230 assert(!CVT.isVector() && "Must not broadcast a vector type");
6231
6232 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6233 // For size optimization, also splat v2f64 and v2i64, and for size opt
6234 // with AVX2, also splat i8 and i16.
6235 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6236 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6237 (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6238 const Constant *C = nullptr;
6239 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6240 C = CI->getConstantIntValue();
6241 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6242 C = CF->getConstantFPValue();
6243
6244 assert(C && "Invalid constant type")((C && "Invalid constant type") ? static_cast<void
> (0) : __assert_fail ("C && \"Invalid constant type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 6244, __PRETTY_FUNCTION__))
;
6245
6246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6247 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6248 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6249 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6250 MachinePointerInfo::getConstantPool(),
6251 false, false, false, Alignment);
6252
6253 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6254 }
6255 }
6256
6257 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6258
6259 // Handle AVX2 in-register broadcasts.
6260 if (!IsLoad && Subtarget->hasInt256() &&
6261 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6262 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6263
6264 // The scalar source must be a normal load.
6265 if (!IsLoad)
6266 return SDValue();
6267
6268 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6269 (Subtarget->hasVLX() && ScalarSize == 64))
6270 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6271
6272 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
6273 // match double, since there is no vbroadcastsd xmm.
6274 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6275 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6276 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6277 }
6278
6279 // Unsupported broadcast.
6280 return SDValue();
6281}
6282
6283/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6284/// underlying vector and index.
6285///
6286/// Modifies \p ExtractedFromVec to the real vector and returns the real
6287/// index.
6288static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6289 SDValue ExtIdx) {
6290 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6291 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6292 return Idx;
6293
6294 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6295 // lowered this:
6296 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6297 // to:
6298 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6299 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6300 // undef)
6301 // Constant<0>)
6302 // In this case the vector is the extract_subvector expression and the index
6303 // is 2, as specified by the shuffle.
6304 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6305 SDValue ShuffleVec = SVOp->getOperand(0);
6306 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6307 assert(ShuffleVecVT.getVectorElementType() ==
6308 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6309
6310 int ShuffleIdx = SVOp->getMaskElt(Idx);
6311 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6312 ExtractedFromVec = ShuffleVec;
6313 return ShuffleIdx;
6314 }
6315 return Idx;
6316}
6317
6318static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6319 MVT VT = Op.getSimpleValueType();
6320
6321 // Skip if insert_vec_elt is not supported.
6322 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6323 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6324 return SDValue();
6325
6326 SDLoc DL(Op);
6327 unsigned NumElems = Op.getNumOperands();
6328
6329 SDValue VecIn1;
6330 SDValue VecIn2;
6331 SmallVector<unsigned, 4> InsertIndices;
6332 SmallVector<int, 8> Mask(NumElems, -1);
6333
6334 for (unsigned i = 0; i != NumElems; ++i) {
6335 unsigned Opc = Op.getOperand(i).getOpcode();
6336
6337 if (Opc == ISD::UNDEF)
6338 continue;
6339
6340 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6342 // Quit if more than 1 element needs inserting.
6342 if (InsertIndices.size() > 1)
6343 return SDValue();
6344
6345 InsertIndices.push_back(i);
6346 continue;
6347 }
6348
6349 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6350 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6351 // Quit if non-constant index.
6352 if (!isa<ConstantSDNode>(ExtIdx))
6353 return SDValue();
6354 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6355
6356 // Quit if extracted from vector of different type.
6357 if (ExtractedFromVec.getValueType() != VT)
6358 return SDValue();
6359
6360 if (!VecIn1.getNode())
6361 VecIn1 = ExtractedFromVec;
6362 else if (VecIn1 != ExtractedFromVec) {
6363 if (!VecIn2.getNode())
6364 VecIn2 = ExtractedFromVec;
6365 else if (VecIn2 != ExtractedFromVec)
6366 // Quit if more than 2 vectors to shuffle
6367 return SDValue();
6368 }
6369
6370 if (ExtractedFromVec == VecIn1)
6371 Mask[i] = Idx;
6372 else if (ExtractedFromVec == VecIn2)
6373 Mask[i] = Idx + NumElems;
6374 }
6375
6376 if (!VecIn1.getNode())
6377 return SDValue();
6378
6379 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6380 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6381 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6382 unsigned Idx = InsertIndices[i];
6383 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6384 DAG.getIntPtrConstant(Idx));
6385 }
6386
6387 return NV;
6388}
6389
6390// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6391SDValue
6392X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6393
6394 MVT VT = Op.getSimpleValueType();
6395 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6396 "Unexpected type in LowerBUILD_VECTORvXi1!");
6397
6398 SDLoc dl(Op);
6399 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6400 SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6401 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6402 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6403 }
6404
6405 if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6406 SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6407 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6408 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6409 }
6410
6411 bool AllContants = true;
6412 uint64_t Immediate = 0;
6413 int NonConstIdx = -1;
6414 bool IsSplat = true;
6415 unsigned NumNonConsts = 0;
6416 unsigned NumConsts = 0;
6417 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6418 SDValue In = Op.getOperand(idx);
6419 if (In.getOpcode() == ISD::UNDEF)
6420 continue;
6421 if (!isa<ConstantSDNode>(In)) {
6422 AllContants = false;
6423 NonConstIdx = idx;
6424 NumNonConsts++;
6425 } else {
6426 NumConsts++;
6427 if (cast<ConstantSDNode>(In)->getZExtValue())
6428 Immediate |= (1ULL << idx);
6429 }
6430 if (In != Op.getOperand(0))
6431 IsSplat = false;
6432 }
6433
6434 if (AllContants) {
6435 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6436 DAG.getConstant(Immediate, MVT::i16));
6437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6438 DAG.getIntPtrConstant(0));
6439 }
6440
6441 if (NumNonConsts == 1 && NonConstIdx != 0) {
6442 SDValue DstVec;
6443 if (NumConsts) {
6444 SDValue VecAsImm = DAG.getConstant(Immediate,
6445 MVT::getIntegerVT(VT.getSizeInBits()));
6446 DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6447 }
6448 else
6449 DstVec = DAG.getUNDEF(VT);
6450 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6451 Op.getOperand(NonConstIdx),
6452 DAG.getIntPtrConstant(NonConstIdx));
6453 }
6454 if (!IsSplat && (NonConstIdx != 0))
6455 llvm_unreachable("Unsupported BUILD_VECTOR operation")::llvm::llvm_unreachable_internal("Unsupported BUILD_VECTOR operation"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 6455)
;
6456 MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6457 SDValue Select;
6458 if (IsSplat)
6459 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6460 DAG.getConstant(-1, SelectVT),
6461 DAG.getConstant(0, SelectVT));
6462 else
6463 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6464 DAG.getConstant((Immediate | 1), SelectVT),
6465 DAG.getConstant(Immediate, SelectVT));
6466 return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6467}
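
The all-constant path above packs the i1 operands into a single integer immediate before bitcasting it to a v16i1 mask. A minimal standalone sketch of that bit-packing step, with invented names and kept outside of X86ISelLowering.cpp, would look like this:

#include <cstdint>
#include <vector>

// Bit Idx of the immediate is set exactly when operand Idx is a non-zero
// constant, mirroring the loop over Op's operands above.
static uint64_t packMaskImmediate(const std::vector<bool> &Bits) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0, E = Bits.size(); Idx != E; ++Idx)
    if (Bits[Idx])
      Immediate |= (1ULL << Idx);
  return Immediate;
}

// packMaskImmediate({1, 0, 1, 1}) == 0b1101 == 13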
6468
6469/// \brief Return true if \p N implements a horizontal binop and place the
6470/// operands for the horizontal binop into V0 and V1.
6471///
6472/// This is a helper function of PerformBUILD_VECTORCombine.
6473/// This function checks whether the input build_vector \p N implements a
6474/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6475/// operation to match.
6476/// For example, if \p Opcode is equal to ISD::ADD, then this function
6477/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6478/// is equal to ISD::SUB, then this function checks if this is a horizontal
6479/// arithmetic sub.
6480///
6481/// This function only analyzes elements of \p N whose indices are
6482/// in range [BaseIdx, LastIdx).
6483static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6484 SelectionDAG &DAG,
6485 unsigned BaseIdx, unsigned LastIdx,
6486 SDValue &V0, SDValue &V1) {
6487 EVT VT = N->getValueType(0);
6488
6489 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6490 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6491 "Invalid Vector in input!");
6492
6493 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6494 bool CanFold = true;
6495 unsigned ExpectedVExtractIdx = BaseIdx;
6496 unsigned NumElts = LastIdx - BaseIdx;
6497 V0 = DAG.getUNDEF(VT);
6498 V1 = DAG.getUNDEF(VT);
6499
6500 // Check if N implements a horizontal binop.
6501 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6502 SDValue Op = N->getOperand(i + BaseIdx);
6503
6504 // Skip UNDEFs.
6505 if (Op->getOpcode() == ISD::UNDEF) {
6506 // Update the expected vector extract index.
6507 if (i * 2 == NumElts)
6508 ExpectedVExtractIdx = BaseIdx;
6509 ExpectedVExtractIdx += 2;
6510 continue;
6511 }
6512
6513 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6514
6515 if (!CanFold)
6516 break;
6517
6518 SDValue Op0 = Op.getOperand(0);
6519 SDValue Op1 = Op.getOperand(1);
6520
6521 // Try to match the following pattern:
6522 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6523 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6524 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6525 Op0.getOperand(0) == Op1.getOperand(0) &&
6526 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6527 isa<ConstantSDNode>(Op1.getOperand(1)));
6528 if (!CanFold)
6529 break;
6530
6531 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6532 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6533
6534 if (i * 2 < NumElts) {
6535 if (V0.getOpcode() == ISD::UNDEF)
6536 V0 = Op0.getOperand(0);
6537 } else {
6538 if (V1.getOpcode() == ISD::UNDEF)
6539 V1 = Op0.getOperand(0);
6540 if (i * 2 == NumElts)
6541 ExpectedVExtractIdx = BaseIdx;
6542 }
6543
6544 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6545 if (I0 == ExpectedVExtractIdx)
6546 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6547 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6548 // Try to match the following dag sequence:
6549 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6550 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6551 } else
6552 CanFold = false;
6553
6554 ExpectedVExtractIdx += 2;
6555 }
6556
6557 return CanFold;
6558}
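
As a scalar reference for what this matcher is looking for, here is an illustrative sketch (not part of the file) of a 128-bit horizontal add: each result element sums two adjacent elements, the low half drawn from the first input and the high half from the second, which is what HADDPS (and PHADDD for integers) computes.

#include <array>
#include <cstddef>

// Scalar model: R[i] = A[2i] + A[2i+1] for the low half and
// R[Half + i] = B[2i] + B[2i+1] for the high half.
template <std::size_t N>
std::array<float, N> horizontalAdd(const std::array<float, N> &A,
                                   const std::array<float, N> &B) {
  std::array<float, N> R{};
  const std::size_t Half = N / 2;
  for (std::size_t i = 0; i != Half; ++i) {
    R[i] = A[2 * i] + A[2 * i + 1];        // low half comes from A
    R[Half + i] = B[2 * i] + B[2 * i + 1]; // high half comes from B
  }
  return R;
}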
6559
6560/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6561/// a concat_vector.
6562///
6563/// This is a helper function of PerformBUILD_VECTORCombine.
6564/// This function expects two 256-bit vectors called V0 and V1.
6565/// At first, each vector is split into two separate 128-bit vectors.
6566/// Then, the resulting 128-bit vectors are used to implement two
6567/// horizontal binary operations.
6568///
6569/// The kind of horizontal binary operation is defined by \p X86Opcode.
6570///
6571/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6572/// the two new horizontal binops.
6573/// When Mode is set, the first horizontal binop dag node takes as input
6574/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6575/// horizontal binop dag node takes as input the lower 128-bit of V1
6576/// and the upper 128-bit of V1.
6577/// Example:
6578/// HADD V0_LO, V0_HI
6579/// HADD V1_LO, V1_HI
6580///
6581/// Otherwise, the first horizontal binop dag node takes as input the lower
6582/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6583/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6584/// Example:
6585/// HADD V0_LO, V1_LO
6586/// HADD V0_HI, V1_HI
6587///
6588/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6589/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6590/// the upper 128-bits of the result.
6591static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6592 SDLoc DL, SelectionDAG &DAG,
6593 unsigned X86Opcode, bool Mode,
6594 bool isUndefLO, bool isUndefHI) {
6595 EVT VT = V0.getValueType();
6596 assert(VT.is256BitVector() && VT == V1.getValueType() &&
6597 "Invalid nodes in input!");
6598
6599 unsigned NumElts = VT.getVectorNumElements();
6600 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6601 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6602 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6603 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6604 EVT NewVT = V0_LO.getValueType();
6605
6606 SDValue LO = DAG.getUNDEF(NewVT);
6607 SDValue HI = DAG.getUNDEF(NewVT);
6608
6609 if (Mode) {
6610 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6611 if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6612 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6613 if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6614 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6615 } else {
6616 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6617 if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6618 V1_LO->getOpcode() != ISD::UNDEF))
6619 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6620
6621 if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6622 V1_HI->getOpcode() != ISD::UNDEF))
6623 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6624 }
6625
6626 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6627}
6628
6629/// \brief Try to fold a build_vector that performs an 'addsub' into the
6630/// sequence of 'vadd + vsub + blendi'.
6631static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6632 const X86Subtarget *Subtarget) {
6633 SDLoc DL(BV);
6634 EVT VT = BV->getValueType(0);
6635 unsigned NumElts = VT.getVectorNumElements();
6636 SDValue InVec0 = DAG.getUNDEF(VT);
6637 SDValue InVec1 = DAG.getUNDEF(VT);
6638
6639 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6640 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6641
6642 // Odd-numbered elements in the input build vector are obtained from
6643 // adding two integer/float elements.
6644 // Even-numbered elements in the input build vector are obtained from
6645 // subtracting two integer/float elements.
6646 unsigned ExpectedOpcode = ISD::FSUB;
6647 unsigned NextExpectedOpcode = ISD::FADD;
6648 bool AddFound = false;
6649 bool SubFound = false;
6650
6651 for (unsigned i = 0, e = NumElts; i != e; i++) {
6652 SDValue Op = BV->getOperand(i);
6653
6654 // Skip 'undef' values.
6655 unsigned Opcode = Op.getOpcode();
6656 if (Opcode == ISD::UNDEF) {
6657 std::swap(ExpectedOpcode, NextExpectedOpcode);
6658 continue;
6659 }
6660
6661 // Early exit if we found an unexpected opcode.
6662 if (Opcode != ExpectedOpcode)
6663 return SDValue();
6664
6665 SDValue Op0 = Op.getOperand(0);
6666 SDValue Op1 = Op.getOperand(1);
6667
6668 // Try to match the following pattern:
6669 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6670 // Early exit if we cannot match that sequence.
6671 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6672 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6673 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6674 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6675 Op0.getOperand(1) != Op1.getOperand(1))
6676 return SDValue();
6677
6678 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6679 if (I0 != i)
6680 return SDValue();
6681
6682 // We found a valid add/sub node. Update the information accordingly.
6683 if (i & 1)
6684 AddFound = true;
6685 else
6686 SubFound = true;
6687
6688 // Update InVec0 and InVec1.
6689 if (InVec0.getOpcode() == ISD::UNDEF)
6690 InVec0 = Op0.getOperand(0);
6691 if (InVec1.getOpcode() == ISD::UNDEF)
6692 InVec1 = Op1.getOperand(0);
6693
6694 // Make sure that the operands of each add/sub node always
6695 // come from the same pair of vectors.
6696 if (InVec0 != Op0.getOperand(0)) {
6697 if (ExpectedOpcode == ISD::FSUB)
6698 return SDValue();
6699
6700 // FADD is commutable. Try to commute the operands
6701 // and then test again.
6702 std::swap(Op0, Op1);
6703 if (InVec0 != Op0.getOperand(0))
6704 return SDValue();
6705 }
6706
6707 if (InVec1 != Op1.getOperand(0))
6708 return SDValue();
6709
6710 // Update the pair of expected opcodes.
6711 std::swap(ExpectedOpcode, NextExpectedOpcode);
6712 }
6713
6714 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6715 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6716 InVec1.getOpcode() != ISD::UNDEF)
6717 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6718
6719 return SDValue();
6720}
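
For reference, the scalar pattern being matched here looks as follows (an illustrative sketch, not part of the file): even-indexed elements subtract and odd-indexed elements add, which is what ADDSUBPS/ADDSUBPD implement.

#include <cstddef>
#include <vector>

// Scalar model of X86ISD::ADDSUB: even lanes subtract, odd lanes add.
static std::vector<double> addSub(const std::vector<double> &A,
                                  const std::vector<double> &B) {
  std::vector<double> R(A.size());
  for (std::size_t i = 0; i != A.size(); ++i)
    R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
  return R;
}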
6721
6722static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6723 const X86Subtarget *Subtarget) {
6724 SDLoc DL(N);
6725 EVT VT = N->getValueType(0);
6726 unsigned NumElts = VT.getVectorNumElements();
6727 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6728 SDValue InVec0, InVec1;
6729
6730 // Try to match an ADDSUB.
6731 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6732 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6733 SDValue Value = matchAddSub(BV, DAG, Subtarget);
6734 if (Value.getNode())
6735 return Value;
6736 }
6737
6738 // Try to match horizontal ADD/SUB.
6739 unsigned NumUndefsLO = 0;
6740 unsigned NumUndefsHI = 0;
6741 unsigned Half = NumElts/2;
6742
6743 // Count the number of UNDEF operands in the input build_vector.
6744 for (unsigned i = 0, e = Half; i != e; ++i)
6745 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6746 NumUndefsLO++;
6747
6748 for (unsigned i = Half, e = NumElts; i != e; ++i)
6749 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6750 NumUndefsHI++;
6751
6752 // Early exit if this is either a build_vector of all UNDEFs or all the
6753 // operands but one are UNDEF.
6754 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6755 return SDValue();
6756
6757 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6758 // Try to match an SSE3 float HADD/HSUB.
6759 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6760 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6761
6762 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6763 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6764 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6765 // Try to match an SSSE3 integer HADD/HSUB.
6766 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6767 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6768
6769 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6770 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6771 }
6772
6773 if (!Subtarget->hasAVX())
6774 return SDValue();
6775
6776 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6777 // Try to match an AVX horizontal add/sub of packed single/double
6778 // precision floating point values from 256-bit vectors.
6779 SDValue InVec2, InVec3;
6780 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6781 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6782 ((InVec0.getOpcode() == ISD::UNDEF ||
6783 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6784 ((InVec1.getOpcode() == ISD::UNDEF ||
6785 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6786 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6787
6788 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6789 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6790 ((InVec0.getOpcode() == ISD::UNDEF ||
6791 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6792 ((InVec1.getOpcode() == ISD::UNDEF ||
6793 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6794 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6795 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6796 // Try to match an AVX2 horizontal add/sub of signed integers.
6797 SDValue InVec2, InVec3;
6798 unsigned X86Opcode;
6799 bool CanFold = true;
6800
6801 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6802 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6803 ((InVec0.getOpcode() == ISD::UNDEF ||
6804 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6805 ((InVec1.getOpcode() == ISD::UNDEF ||
6806 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6807 X86Opcode = X86ISD::HADD;
6808 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6809 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6810 ((InVec0.getOpcode() == ISD::UNDEF ||
6811 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6812 ((InVec1.getOpcode() == ISD::UNDEF ||
6813 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6814 X86Opcode = X86ISD::HSUB;
6815 else
6816 CanFold = false;
6817
6818 if (CanFold) {
6819 // Fold this build_vector into a single horizontal add/sub.
6820 // Do this only if the target has AVX2.
6821 if (Subtarget->hasAVX2())
6822 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6823
6824 // Do not try to expand this build_vector into a pair of horizontal
6825 // add/sub if we can emit a pair of scalar add/sub.
6826 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6827 return SDValue();
6828
6829 // Convert this build_vector into a pair of horizontal binop followed by
6830 // a concat vector.
6831 bool isUndefLO = NumUndefsLO == Half;
6832 bool isUndefHI = NumUndefsHI == Half;
6833 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6834 isUndefLO, isUndefHI);
6835 }
6836 }
6837
6838 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6839 VT == MVT::v16i16) && Subtarget->hasAVX()) {
6840 unsigned X86Opcode;
6841 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6842 X86Opcode = X86ISD::HADD;
6843 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6844 X86Opcode = X86ISD::HSUB;
6845 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6846 X86Opcode = X86ISD::FHADD;
6847 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6848 X86Opcode = X86ISD::FHSUB;
6849 else
6850 return SDValue();
6851
6852 // Don't try to expand this build_vector into a pair of horizontal add/sub
6853 // if we can simply emit a pair of scalar add/sub.
6854 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6855 return SDValue();
6856
6857 // Convert this build_vector into two horizontal add/sub followed by
6858 // a concat vector.
6859 bool isUndefLO = NumUndefsLO == Half;
6860 bool isUndefHI = NumUndefsHI == Half;
6861 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6862 isUndefLO, isUndefHI);
6863 }
6864
6865 return SDValue();
6866}
6867
6868SDValue
6869X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6870 SDLoc dl(Op);
6871
6872 MVT VT = Op.getSimpleValueType();
6873 MVT ExtVT = VT.getVectorElementType();
6874 unsigned NumElems = Op.getNumOperands();
6875
6876 // Generate vectors for predicate vectors.
6877 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6878 return LowerBUILD_VECTORvXi1(Op, DAG);
6879
6880 // Vectors containing all zeros can be matched by pxor and xorps later
6881 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6882 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6883 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6884 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6885 return Op;
6886
6887 return getZeroVector(VT, Subtarget, DAG, dl);
6888 }
6889
6890 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6891 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6892 // vpcmpeqd on 256-bit vectors.
6893 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6894 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6895 return Op;
6896
6897 if (!VT.is512BitVector())
6898 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6899 }
6900
6901 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6902 if (Broadcast.getNode())
6903 return Broadcast;
6904
6905 unsigned EVTBits = ExtVT.getSizeInBits();
6906
6907 unsigned NumZero = 0;
6908 unsigned NumNonZero = 0;
6909 unsigned NonZeros = 0;
6910 bool IsAllConstants = true;
6911 SmallSet<SDValue, 8> Values;
6912 for (unsigned i = 0; i < NumElems; ++i) {
6913 SDValue Elt = Op.getOperand(i);
6914 if (Elt.getOpcode() == ISD::UNDEF)
6915 continue;
6916 Values.insert(Elt);
6917 if (Elt.getOpcode() != ISD::Constant &&
6918 Elt.getOpcode() != ISD::ConstantFP)
6919 IsAllConstants = false;
6920 if (X86::isZeroNode(Elt))
6921 NumZero++;
6922 else {
6923 NonZeros |= (1 << i);
6924 NumNonZero++;
6925 }
6926 }
6927
6928 // All undef vector. Return an UNDEF. All zero vectors were handled above.
6929 if (NumNonZero == 0)
6930 return DAG.getUNDEF(VT);
6931
6932 // Special case for single non-zero, non-undef, element.
6933 if (NumNonZero == 1) {
6934 unsigned Idx = countTrailingZeros(NonZeros);
6935 SDValue Item = Op.getOperand(Idx);
6936
6937 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6938 // the value are obviously zero, truncate the value to i32 and do the
6939 // insertion that way. Only do this if the value is non-constant or if the
6940 // value is a constant being inserted into element 0. It is cheaper to do
6941 // a constant pool load than it is to do a movd + shuffle.
6942 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6943 (!IsAllConstants || Idx == 0)) {
6944 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6945 // Handle SSE only.
6946 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6947 EVT VecVT = MVT::v4i32;
6948 unsigned VecElts = 4;
6949
6950 // Truncate the value (which may itself be a constant) to i32, and
6951 // convert it to a vector with movd (S2V+shuffle to zero extend).
6952 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6953 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6954
6955 // If using the new shuffle lowering, just directly insert this.
6956 if (ExperimentalVectorShuffleLowering)
6957 return DAG.getNode(
6958 ISD::BITCAST, dl, VT,
6959 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6960
6961 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6962
6963 // Now we have our 32-bit value zero extended in the low element of
6964 // a vector. If Idx != 0, swizzle it into place.
6965 if (Idx != 0) {
6966 SmallVector<int, 4> Mask;
6967 Mask.push_back(Idx);
6968 for (unsigned i = 1; i != VecElts; ++i)
6969 Mask.push_back(i);
6970 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6971 &Mask[0]);
6972 }
6973 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6974 }
6975 }
6976
6977 // If we have a constant or non-constant insertion into the low element of
6978 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6979 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6980 // depending on what the source datatype is.
6981 if (Idx == 0) {
6982 if (NumZero == 0)
6983 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6984
6985 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6986 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6987 if (VT.is256BitVector() || VT.is512BitVector()) {
6988 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6989 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6990 Item, DAG.getIntPtrConstant(0));
6991 }
6992 assert(VT.is128BitVector() && "Expected an SSE value type!");
6993 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6994 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6995 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6996 }
6997
6998 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6999 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7000 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7001 if (VT.is256BitVector()) {
7002 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7003 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7004 } else {
7005 assert(VT.is128BitVector() && "Expected an SSE value type!");
7006 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7007 }
7008 return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7009 }
7010 }
7011
7012 // Is it a vector logical left shift?
7013 if (NumElems == 2 && Idx == 1 &&
7014 X86::isZeroNode(Op.getOperand(0)) &&
7015 !X86::isZeroNode(Op.getOperand(1))) {
7016 unsigned NumBits = VT.getSizeInBits();
7017 return getVShift(true, VT,
7018 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7019 VT, Op.getOperand(1)),
7020 NumBits/2, DAG, *this, dl);
7021 }
7022
7023 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7024 return SDValue();
7025
7026 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7027 // is a non-constant being inserted into an element other than the low one,
7028 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7029 // movd/movss) to move this into the low element, then shuffle it into
7030 // place.
7031 if (EVTBits == 32) {
7032 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7033
7034 // If using the new shuffle lowering, just directly insert this.
7035 if (ExperimentalVectorShuffleLowering)
7036 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7037
7038 // Turn it into a shuffle of zero and zero-extended scalar to vector.
7039 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7040 SmallVector<int, 8> MaskVec;
7041 for (unsigned i = 0; i != NumElems; ++i)
7042 MaskVec.push_back(i == Idx ? 0 : 1);
7043 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7044 }
7045 }
7046
7047 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7048 if (Values.size() == 1) {
7049 if (EVTBits == 32) {
7050 // Instead of a shuffle like this:
7051 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7052 // Check if it's possible to issue this instead.
7053 // shuffle (vload ptr), undef, <1, 1, 1, 1>
7054 unsigned Idx = countTrailingZeros(NonZeros);
7055 SDValue Item = Op.getOperand(Idx);
7056 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7057 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7058 }
7059 return SDValue();
7060 }
7061
7062 // A vector full of immediates; various special cases are already
7063 // handled, so this is best done with a single constant-pool load.
7064 if (IsAllConstants)
7065 return SDValue();
7066
7067 // For AVX-length vectors, see if we can use a vector load to get all of the
7068 // elements, otherwise build the individual 128-bit pieces and use
7069 // shuffles to put them in place.
7070 if (VT.is256BitVector() || VT.is512BitVector()) {
7071 SmallVector<SDValue, 64> V;
7072 for (unsigned i = 0; i != NumElems; ++i)
7073 V.push_back(Op.getOperand(i));
7074
7075 // Check for a build vector of consecutive loads.
7076 if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7077 return LD;
7078
7079 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7080
7081 // Build both the lower and upper subvector.
7082 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7083 makeArrayRef(&V[0], NumElems/2));
7084 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7085 makeArrayRef(&V[NumElems / 2], NumElems/2));
7086
7087 // Recreate the wider vector with the lower and upper part.
7088 if (VT.is256BitVector())
7089 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7090 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7091 }
7092
7093 // Let legalizer expand 2-wide build_vectors.
7094 if (EVTBits == 64) {
7095 if (NumNonZero == 1) {
7096 // One half is zero or undef.
7097 unsigned Idx = countTrailingZeros(NonZeros);
7098 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7099 Op.getOperand(Idx));
7100 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7101 }
7102 return SDValue();
7103 }
7104
7105 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7106 if (EVTBits == 8 && NumElems == 16) {
7107 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7108 Subtarget, *this);
7109 if (V.getNode()) return V;
7110 }
7111
7112 if (EVTBits == 16 && NumElems == 8) {
7113 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7114 Subtarget, *this);
7115 if (V.getNode()) return V;
7116 }
7117
7118 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7119 if (EVTBits == 32 && NumElems == 4) {
7120 SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7121 if (V.getNode())
7122 return V;
7123 }
7124
7125 // If element VT is == 32 bits, turn it into a number of shuffles.
7126 SmallVector<SDValue, 8> V(NumElems);
7127 if (NumElems == 4 && NumZero > 0) {
7128 for (unsigned i = 0; i < 4; ++i) {
7129 bool isZero = !(NonZeros & (1 << i));
7130 if (isZero)
7131 V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7132 else
7133 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7134 }
7135
7136 for (unsigned i = 0; i < 2; ++i) {
7137 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7138 default: break;
7139 case 0:
7140 V[i] = V[i*2]; // Must be a zero vector.
7141 break;
7142 case 1:
7143 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7144 break;
7145 case 2:
7146 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7147 break;
7148 case 3:
7149 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7150 break;
7151 }
7152 }
7153
7154 bool Reverse1 = (NonZeros & 0x3) == 2;
7155 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7156 int MaskVec[] = {
7157 Reverse1 ? 1 : 0,
7158 Reverse1 ? 0 : 1,
7159 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7160 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7161 };
7162 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7163 }
7164
7165 if (Values.size() > 1 && VT.is128BitVector()) {
7166 // Check for a build vector of consecutive loads.
7167 for (unsigned i = 0; i < NumElems; ++i)
7168 V[i] = Op.getOperand(i);
7169
7170 // Check for elements which are consecutive loads.
7171 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7172 if (LD.getNode())
7173 return LD;
7174
7175 // Check for a build vector from mostly shuffle plus few inserting.
7176 SDValue Sh = buildFromShuffleMostly(Op, DAG);
7177 if (Sh.getNode())
7178 return Sh;
7179
7180 // For SSE 4.1, use insertps to put the high elements into the low element.
7181 if (getSubtarget()->hasSSE41()) {
7182 SDValue Result;
7183 if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7184 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7185 else
7186 Result = DAG.getUNDEF(VT);
7187
7188 for (unsigned i = 1; i < NumElems; ++i) {
7189 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7190 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7191 Op.getOperand(i), DAG.getIntPtrConstant(i));
7192 }
7193 return Result;
7194 }
7195
7196 // Otherwise, expand into a number of unpckl*, start by extending each of
7197 // our (non-undef) elements to the full vector width with the element in the
7198 // bottom slot of the vector (which generates no code for SSE).
7199 for (unsigned i = 0; i < NumElems; ++i) {
7200 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7201 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7202 else
7203 V[i] = DAG.getUNDEF(VT);
7204 }
7205
7206 // Next, we iteratively mix elements, e.g. for v4f32:
7207 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7208 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7209 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7210 unsigned EltStride = NumElems >> 1;
7211 while (EltStride != 0) {
7212 for (unsigned i = 0; i < EltStride; ++i) {
7213 // If V[i+EltStride] is undef and this is the first round of mixing,
7214 // then it is safe to just drop this shuffle: V[i] is already in the
7215 // right place, the one element (since it's the first round) being
7216 // inserted as undef can be dropped. This isn't safe for successive
7217 // rounds because they will permute elements within both vectors.
7218 if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7219 EltStride == NumElems/2)
7220 continue;
7221
7222 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7223 }
7224 EltStride >>= 1;
7225 }
7226 return V[0];
7227 }
7228 return SDValue();
7229}
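
The closing unpckl loop above merges the scalars pairwise over log2(NumElems) rounds. A standalone scalar model of that merge, with invented helper names and kept deliberately separate from the DAG code, is sketched below; unpackLow mimics what getUnpackl does to the low halves.

#include <cstddef>
#include <vector>

// Interleave the low halves of A and B: {A0, B0, A1, B1, ...}.
static std::vector<int> unpackLow(const std::vector<int> &A,
                                  const std::vector<int> &B) {
  std::vector<int> R(A.size());
  for (std::size_t i = 0; i != A.size() / 2; ++i) {
    R[2 * i] = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

// Each V[i] starts with scalar i in slot 0; after the rounds below, V[0]
// holds {elt0, elt1, ..., eltN-1}, matching the Step 1/Step 2 comment.
static std::vector<int> buildByUnpack(std::vector<std::vector<int>> V) {
  for (std::size_t Stride = V.size() / 2; Stride != 0; Stride /= 2)
    for (std::size_t i = 0; i != Stride; ++i)
      V[i] = unpackLow(V[i], V[i + Stride]);
  return V[0];
}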
7230
7231// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7232// to create 256-bit vectors from two other 128-bit ones.
7233static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7234 SDLoc dl(Op);
7235 MVT ResVT = Op.getSimpleValueType();
7236
7237 assert((ResVT.is256BitVector() ||
7238 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7239
7240 SDValue V1 = Op.getOperand(0);
7241 SDValue V2 = Op.getOperand(1);
7242 unsigned NumElems = ResVT.getVectorNumElements();
7243 if(ResVT.is256BitVector())
7244 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7245
7246 if (Op.getNumOperands() == 4) {
7247 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7248 ResVT.getVectorNumElements()/2);
7249 SDValue V3 = Op.getOperand(2);
7250 SDValue V4 = Op.getOperand(3);
7251 return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7252 Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7253 }
7254 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7255}
7256
7257static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7258 MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7259 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7260 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7261 Op.getNumOperands() == 4)));
7262
7263 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7264 // from two other 128-bit ones.
7265
7266 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7267 return LowerAVXCONCAT_VECTORS(Op, DAG);
7268}
7269
7270
7271//===----------------------------------------------------------------------===//
7272// Vector shuffle lowering
7273//
7274// This is an experimental code path for lowering vector shuffles on x86. It is
7275// designed to handle arbitrary vector shuffles and blends, gracefully
7276// degrading performance as necessary. It works hard to recognize idiomatic
7277// shuffles and lower them to optimal instruction patterns without leaving
7278// a framework that allows reasonably efficient handling of all vector shuffle
7279// patterns.
7280//===----------------------------------------------------------------------===//
7281
7282/// \brief Tiny helper function to identify a no-op mask.
7283///
7284/// This is a somewhat boring predicate function. It checks whether the mask
7285/// array input, which is assumed to be a single-input shuffle mask of the kind
7286/// used by the X86 shuffle instructions (not a fully general
7287/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7288/// in-place shuffle are 'no-op's.
7289static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7290 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7291 if (Mask[i] != -1 && Mask[i] != i)
7292 return false;
7293 return true;
7294}
7295
7296/// \brief Helper function to classify a mask as a single-input mask.
7297///
7298/// This isn't a generic single-input test because in the vector shuffle
7299/// lowering we canonicalize single inputs to be the first input operand. This
7300/// means we can more quickly test for a single input by only checking whether
7301/// an input from the second operand exists. We also assume that the size of
7302/// mask corresponds to the size of the input vectors which isn't true in the
7303/// fully general case.
7304static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7305 for (int M : Mask)
7306 if (M >= (int)Mask.size())
7307 return false;
7308 return true;
7309}
7310
7311/// \brief Test whether there are elements crossing 128-bit lanes in this
7312/// shuffle mask.
7313///
7314/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7315/// and we routinely test for these.
7316static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7317 int LaneSize = 128 / VT.getScalarSizeInBits();
7318 int Size = Mask.size();
7319 for (int i = 0; i < Size; ++i)
7320 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7321 return true;
7322 return false;
7323}
7324
7325/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7326///
7327/// This checks a shuffle mask to see if it is performing the same
7328/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7329/// that it is also not lane-crossing. It may however involve a blend from the
7330/// same lane of a second vector.
7331///
7332/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7333/// non-trivial to compute in the face of undef lanes. The representation is
7334/// *not* suitable for use with existing 128-bit shuffles as it will contain
7335/// entries from both V1 and V2 inputs to the wider mask.
7336static bool
7337is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7338 SmallVectorImpl<int> &RepeatedMask) {
7339 int LaneSize = 128 / VT.getScalarSizeInBits();
7340 RepeatedMask.resize(LaneSize, -1);
7341 int Size = Mask.size();
7342 for (int i = 0; i < Size; ++i) {
7343 if (Mask[i] < 0)
7344 continue;
7345 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7346 // This entry crosses lanes, so there is no way to model this shuffle.
7347 return false;
7348
7349 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7350 if (RepeatedMask[i % LaneSize] == -1)
7351 // This is the first non-undef entry in this slot of a 128-bit lane.
7352 RepeatedMask[i % LaneSize] =
7353 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7354 else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7355 // Found a mismatch with the repeated mask.
7356 return false;
7357 }
7358 return true;
7359}
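
A hand-computed example of the representation described above (illustrative only): for a v8f32 two-input mask whose lanes repeat, the helper succeeds and the V2 entries stay offset by the full mask width, which is why the repeated mask cannot be fed directly to an existing 128-bit shuffle.

//   Wide mask (v8f32, LaneSize 4):  < 0, 8, 1, 9, 4, 12, 5, 13 >
//   RepeatedMask produced:          < 0, 8, 1, 9 >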
7360
7361// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7362// 2013 will allow us to use it as a non-type template parameter.
7363namespace {
7364
7365/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7366///
7367/// See its documentation for details.
7368bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7369 if (Mask.size() != Args.size())
7370 return false;
7371 for (int i = 0, e = Mask.size(); i < e; ++i) {
7372 assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7373 if (Mask[i] != -1 && Mask[i] != *Args[i])
7374 return false;
7375 }
7376 return true;
7377}
7378
7379} // namespace
7380
7381/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7382/// arguments.
7383///
7384/// This is a fast way to test a shuffle mask against a fixed pattern:
7385///
7386/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7387///
7388/// It returns true if the mask is exactly as wide as the argument list, and
7389/// each element of the mask is either -1 (signifying undef) or the value given
7390/// in the argument.
7391static const VariadicFunction1<
7392 bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7393
7394/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7395///
7396/// This helper function produces an 8-bit shuffle immediate corresponding to
7397/// the ubiquitous shuffle encoding scheme used in x86 instructions for
7398/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7399/// example.
7400///
7401/// NB: We rely heavily on "undef" masks preserving the input lane.
7402static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7403 SelectionDAG &DAG) {
7404 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7405 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7406 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7407 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7408 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7409
7410 unsigned Imm = 0;
7411 Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7412 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7413 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7414 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7415 return DAG.getConstant(Imm, MVT::i8);
7416}
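
A compact standalone sketch of the same encoding (shuffleImm is an invented name, not part of the file): each lane contributes two bits, lane 0 in the low bits, so the four-lane reversal <3, 2, 1, 0> encodes to 0x1B, the familiar shufps/pshufd reverse immediate.

#include <cstdint>

// Two bits per lane, lane 0 in the least-significant bits.
constexpr uint8_t shuffleImm(unsigned M0, unsigned M1, unsigned M2,
                             unsigned M3) {
  return uint8_t(M0 | (M1 << 2) | (M2 << 4) | (M3 << 6));
}

static_assert(shuffleImm(3, 2, 1, 0) == 0x1B, "reverse-lane immediate");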
7417
7418/// \brief Try to emit a blend instruction for a shuffle.
7419///
7420/// This doesn't do any checks for the availability of instructions for blending
7421/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7422/// be matched in the backend with the type given. What it does check for is
7423/// that the shuffle mask is in fact a blend.
7424static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7425 SDValue V2, ArrayRef<int> Mask,
7426 const X86Subtarget *Subtarget,
7427 SelectionDAG &DAG) {
7428
7429 unsigned BlendMask = 0;
7430 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7431 if (Mask[i] >= Size) {
7432 if (Mask[i] != i + Size)
7433 return SDValue(); // Shuffled V2 input!
7434 BlendMask |= 1u << i;
7435 continue;
7436 }
7437 if (Mask[i] >= 0 && Mask[i] != i)
7438 return SDValue(); // Shuffled V1 input!
7439 }
7440 switch (VT.SimpleTy) {
7441 case MVT::v2f64:
7442 case MVT::v4f32:
7443 case MVT::v4f64:
7444 case MVT::v8f32:
7445 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7446 DAG.getConstant(BlendMask, MVT::i8));
7447
7448 case MVT::v4i64:
7449 case MVT::v8i32:
7450 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7451 // FALLTHROUGH
7452 case MVT::v2i64:
7453 case MVT::v4i32:
7454 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7455 // that instruction.
7456 if (Subtarget->hasAVX2()) {
7457 // Scale the blend by the number of 32-bit dwords per element.
7458 int Scale = VT.getScalarSizeInBits() / 32;
7459 BlendMask = 0;
7460 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7461 if (Mask[i] >= Size)
7462 for (int j = 0; j < Scale; ++j)
7463 BlendMask |= 1u << (i * Scale + j);
7464
7465 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7466 V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7467 V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7468 return DAG.getNode(ISD::BITCAST, DL, VT,
7469 DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7470 DAG.getConstant(BlendMask, MVT::i8)));
7471 }
7472 // FALLTHROUGH
7473 case MVT::v8i16: {
7474 // For integer shuffles we need to expand the mask and cast the inputs to
7475 // v8i16s prior to blending.
7476 int Scale = 8 / VT.getVectorNumElements();
7477 BlendMask = 0;
7478 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7479 if (Mask[i] >= Size)
7480 for (int j = 0; j < Scale; ++j)
7481 BlendMask |= 1u << (i * Scale + j);
7482
7483 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7484 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7485 return DAG.getNode(ISD::BITCAST, DL, VT,
7486 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7487 DAG.getConstant(BlendMask, MVT::i8)));
7488 }
7489
7490 case MVT::v16i16: {
7491 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7492 SmallVector<int, 8> RepeatedMask;
7493 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7494 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7495 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7496 BlendMask = 0;
7497 for (int i = 0; i < 8; ++i)
7498 if (RepeatedMask[i] >= 16)
7499 BlendMask |= 1u << i;
7500 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7501 DAG.getConstant(BlendMask, MVT::i8));
7502 }
7503 }
7504 // FALLTHROUGH
7505 case MVT::v32i8: {
7506 assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7507 // Scale the blend by the number of bytes per element.
7508 int Scale = VT.getScalarSizeInBits() / 8;
7509 assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7510
7511 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7512 // mix of LLVM's code generator and the x86 backend. We tell the code
7513 // generator that boolean values in the elements of an x86 vector register
7514 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7515 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7516 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7517 // of the element (the remaining are ignored) and 0 in that high bit would
7518 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7519 // the LLVM model for boolean values in vector elements gets the relevant
7520 // bit set, it is set backwards and over constrained relative to x86's
7521 // actual model.
7522 SDValue VSELECTMask[32];
7523 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7524 for (int j = 0; j < Scale; ++j)
7525 VSELECTMask[Scale * i + j] =
7526 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7527 : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7528
7529 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7530 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7531 return DAG.getNode(
7532 ISD::BITCAST, DL, VT,
7533 DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7534 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7535 V1, V2));
7536 }
7537
7538 default:
7539 llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 7539)
;
7540 }
7541}
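
The BlendMask computed at the top of this function simply records, per element, whether the value is taken from V2. A standalone sketch with a hand-checked example (illustrative only, not part of the file):

#include <cstdint>
#include <vector>

// Bit i of the blend immediate is set when mask element i addresses V2,
// i.e. Mask[i] >= Size. For the v4i32 mask {0, 5, 2, 7} this yields 0b1010.
static uint32_t computeBlendMask(const std::vector<int> &Mask) {
  const int Size = static_cast<int>(Mask.size());
  uint32_t BlendMask = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)
      BlendMask |= 1u << i;
  return BlendMask;
}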
7542
7543/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7544/// unblended shuffles followed by an unshuffled blend.
7545///
7546/// This matches the extremely common pattern for handling combined
7547/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7548/// operations.
7549static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7550 SDValue V1,
7551 SDValue V2,
7552 ArrayRef<int> Mask,
7553 SelectionDAG &DAG) {
7554 // Shuffle the input elements into the desired positions in V1 and V2 and
7555 // blend them together.
7556 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7557 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7558 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7559 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7560 if (Mask[i] >= 0 && Mask[i] < Size) {
7561 V1Mask[i] = Mask[i];
7562 BlendMask[i] = i;
7563 } else if (Mask[i] >= Size) {
7564 V2Mask[i] = Mask[i] - Size;
7565 BlendMask[i] = i + Size;
7566 }
7567
7568 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7569 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7570 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7571}
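
A hand-computed trace of the decomposition (illustrative only): the v4i32 mask <2, 5, 0, 7> splits into two single-input shuffles plus a mask that is a pure blend, which the blend lowering above can then handle.

//   Mask:      <  2,  5,  0,  7 >
//   V1Mask:    <  2, -1,  0, -1 >
//   V2Mask:    < -1,  1, -1,  3 >
//   BlendMask: <  0,  5,  2,  7 >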
7572
7573/// \brief Try to lower a vector shuffle as a byte rotation.
7574///
7575/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7576/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7577/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7578/// try to generically lower a vector shuffle through such a pattern. It
7579/// does not check for the profitability of lowering either as PALIGNR or
7580/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7581/// This matches shuffle vectors that look like:
7582///
7583/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7584///
7585/// Essentially it concatenates V1 and V2, shifts right by some number of
7586/// elements, and takes the low elements as the result. Note that while this is
7587/// specified as a *right shift* because x86 is little-endian, it is a *left
7588/// rotate* of the vector lanes.
7589///
7590/// Note that this only handles 128-bit vector widths currently.
7591static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7592 SDValue V2,
7593 ArrayRef<int> Mask,
7594 const X86Subtarget *Subtarget,
7595 SelectionDAG &DAG) {
7596 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7597
7598 // We need to detect various ways of spelling a rotation:
7599 // [11, 12, 13, 14, 15, 0, 1, 2]
7600 // [-1, 12, 13, 14, -1, -1, 1, -1]
7601 // [-1, -1, -1, -1, -1, -1, 1, 2]
7602 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7603 // [-1, 4, 5, 6, -1, -1, 9, -1]
7604 // [-1, 4, 5, 6, -1, -1, -1, -1]
7605 int Rotation = 0;
7606 SDValue Lo, Hi;
7607 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7608 if (Mask[i] == -1)
7609 continue;
7610 assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7611
7612 // Based on the mod-Size value of this mask element determine where
7613 // a rotated vector would have started.
7614 int StartIdx = i - (Mask[i] % Size);
7615 if (StartIdx == 0)
7616 // The identity rotation isn't interesting, stop.
7617 return SDValue();
7618
7619 // If we found the tail of a vector, the rotation must be the missing
7620 // front. If we found the head of a vector, the rotation is the length of that head.
7621 int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7622
7623 if (Rotation == 0)
7624 Rotation = CandidateRotation;
7625 else if (Rotation != CandidateRotation)
7626 // The rotations don't match, so we can't match this mask.
7627 return SDValue();
7628
7629 // Compute which value this mask is pointing at.
7630 SDValue MaskV = Mask[i] < Size ? V1 : V2;
7631
7632 // Compute which of the two target values this index should be assigned to.
7633 // This reflects whether the high elements are remaining or the low elements
7634 // are remaining.
7635 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7636
7637 // Either set up this value if we've not encountered it before, or check
7638 // that it remains consistent.
7639 if (!TargetV)
7640 TargetV = MaskV;
7641 else if (TargetV != MaskV)
7642 // This may be a rotation, but it pulls from the inputs in some
7643 // unsupported interleaving.
7644 return SDValue();
7645 }
7646
7647 // Check that we successfully analyzed the mask, and normalize the results.
7648 assert(Rotation != 0 && "Failed to locate a viable rotation!");
7649 assert((Lo || Hi) && "Failed to find a rotated input vector!");
7650 if (!Lo)
7651 Lo = Hi;
7652 else if (!Hi)
7653 Hi = Lo;
7654
7655 assert(VT.getSizeInBits() == 128 &&
7656        "Rotate-based lowering only supports 128-bit lowering!");
7657 assert(Mask.size() <= 16 &&
7658        "Can shuffle at most 16 bytes in a 128-bit vector!");
7659
7660 // The actual rotate instruction rotates bytes, so we need to scale the
7661 // rotation based on how many bytes are in the vector.
7662 int Scale = 16 / Mask.size();
7663
7664 // SSSE3 targets can use the PALIGNR instruction.
7665 if (Subtarget->hasSSSE3()) {
7666 // Cast the inputs to v16i8 to match PALIGNR.
7667 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7668 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7669
7670 return DAG.getNode(ISD::BITCAST, DL, VT,
7671 DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7672 DAG.getConstant(Rotation * Scale, MVT::i8)));
7673 }
7674
7675 // Default SSE2 implementation
7676 int LoByteShift = 16 - Rotation * Scale;
7677 int HiByteShift = Rotation * Scale;
7678
7679 // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7680 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7681 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7682
7683 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7684 DAG.getConstant(8 * LoByteShift, MVT::i8));
7685 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7686 DAG.getConstant(8 * HiByteShift, MVT::i8));
7687 return DAG.getNode(ISD::BITCAST, DL, VT,
7688 DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7689}
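
A minimal sketch of the rotation detection above, reduced to plain integer mask math; the helper name matchByteRotation is an assumption for illustration, not from the LLVM tree.

  #include <vector>

  // Mirrors the detection loop in lowerVectorShuffleAsByteRotate: returns the
  // element rotation amount, or -1 when the mask is not one consistent
  // rotation (including the uninteresting identity rotation).
  static int matchByteRotation(const std::vector<int> &Mask) {
    int Size = (int)Mask.size();
    int Rotation = 0;
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;
      int StartIdx = i - (Mask[i] % Size);
      if (StartIdx == 0)
        return -1; // identity rotation
      int Candidate = StartIdx < 0 ? -StartIdx : Size - StartIdx;
      if (Rotation == 0)
        Rotation = Candidate;
      else if (Rotation != Candidate)
        return -1; // inconsistent rotation amounts
    }
    return Rotation;
  }

  // Both v8i16 example masks above, [11, 12, 13, 14, 15, 0, 1, 2] and
  // [3, 4, 5, 6, 7, 8, 9, 10], yield Rotation = 3; with Scale = 16 / 8 = 2
  // bytes per element, the byte amount fed to PALIGNR (or to the
  // PSRLDQ/PSLLDQ pair) is Rotation * Scale = 6.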
7690
7691/// \brief Compute whether each element of a shuffle is zeroable.
7692///
7693/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7694/// Either it is an undef element in the shuffle mask, the element of the input
7695/// referenced is undef, or the element of the input referenced is known to be
7696/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7697/// as many lanes with this technique as possible to simplify the remaining
7698/// shuffle.
7699static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7700 SDValue V1, SDValue V2) {
7701 SmallBitVector Zeroable(Mask.size(), false);
7702
7703 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7704 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7705
7706 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7707 int M = Mask[i];
7708 // Handle the easy cases.
7709 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7710 Zeroable[i] = true;
7711 continue;
7712 }
7713
7714 // If this is an index into a build_vector node, dig out the input value and
7715 // use it.
7716 SDValue V = M < Size ? V1 : V2;
7717 if (V.getOpcode() != ISD::BUILD_VECTOR)
7718 continue;
7719
7720 SDValue Input = V.getOperand(M % Size);
7721 // The UNDEF opcode check really should be dead code here, but not quite
7722 // worth asserting on (it isn't invalid, just unexpected).
7723 if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7724 Zeroable[i] = true;
7725 }
7726
7727 return Zeroable;
7728}
7729
7730/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7731///
7732/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7733/// byte-shift instructions. The mask must consist of a shifted sequential
7734/// shuffle from one of the input vectors and zeroable elements for the
7735/// remaining 'shifted in' elements.
7736///
7737/// Note that this only handles 128-bit vector widths currently.
7738static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7739 SDValue V2, ArrayRef<int> Mask,
7740 SelectionDAG &DAG) {
7741 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7742
7743 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7744
7745 int Size = Mask.size();
7746 int Scale = 16 / Size;
7747
7748 for (int Shift = 1; Shift < Size; Shift++) {
7749 int ByteShift = Shift * Scale;
7750
7751 // PSRLDQ : (little-endian) right byte shift
7752 // [ 5, 6, 7, zz, zz, zz, zz, zz]
7753 // [ -1, 5, 6, 7, zz, zz, zz, zz]
7754 // [ 1, 2, -1, -1, -1, -1, zz, zz]
7755 bool ZeroableRight = true;
7756 for (int i = Size - Shift; i < Size; i++) {
7757 ZeroableRight &= Zeroable[i];
7758 }
7759
7760 if (ZeroableRight) {
7761 bool ValidShiftRight1 =
7762 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7763 bool ValidShiftRight2 =
7764 isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7765
7766 if (ValidShiftRight1 || ValidShiftRight2) {
7767 // Cast the inputs to v2i64 to match PSRLDQ.
7768 SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7769 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7770 SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7771 DAG.getConstant(ByteShift * 8, MVT::i8));
7772 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7773 }
7774 }
7775
7776 // PSLLDQ : (little-endian) left byte shift
7777 // [ zz, 0, 1, 2, 3, 4, 5, 6]
7778 // [ zz, zz, -1, -1, 2, 3, 4, -1]
7779 // [ zz, zz, zz, zz, zz, zz, -1, 1]
7780 bool ZeroableLeft = true;
7781 for (int i = 0; i < Shift; i++) {
7782 ZeroableLeft &= Zeroable[i];
7783 }
7784
7785 if (ZeroableLeft) {
7786 bool ValidShiftLeft1 =
7787 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7788 bool ValidShiftLeft2 =
7789 isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7790
7791 if (ValidShiftLeft1 || ValidShiftLeft2) {
7792 // Cast the inputs to v2i64 to match PSLLDQ.
7793 SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7794 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7795 SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7796 DAG.getConstant(ByteShift * 8, MVT::i8));
7797 return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7798 }
7799 }
7800 }
7801
7802 return SDValue();
7803}
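
A minimal sketch of the PSRLDQ half of the match above on plain integers. The helper names are assumptions for illustration, and seqOrUndef reimplements only what the use of isSequentialOrUndefInRange here suggests (that routine is defined earlier in this file).

  #include <vector>

  // Elements Pos..Pos+Len-1 must be undef (-1) or the sequence Low, Low+1, ...
  static bool seqOrUndef(const std::vector<int> &Mask, int Pos, int Len, int Low) {
    for (int i = 0; i < Len; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
        return false;
    return true;
  }

  // Returns the PSRLDQ byte amount for a right byte shift out of V1, or -1.
  static int matchRightByteShiftFromV1(const std::vector<int> &Mask,
                                       const std::vector<bool> &Zeroable) {
    int Size = (int)Mask.size();
    int Scale = 16 / Size; // bytes per element in a 128-bit vector
    for (int Shift = 1; Shift < Size; ++Shift) {
      bool ZeroableRight = true;
      for (int i = Size - Shift; i < Size; ++i)
        ZeroableRight &= Zeroable[i];
      if (ZeroableRight && seqOrUndef(Mask, 0, Size - Shift, Shift))
        return Shift * Scale;
    }
    return -1;
  }

  // For the v8i16 mask [4, 5, 6, 7, zz, zz, zz, zz] (zz = zeroable) this
  // matches at Shift = 4, i.e. an 8-byte PSRLDQ: the high four lanes of V1
  // move down and zeros shift in from the top.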
7804
7805/// \brief Lower a vector shuffle as a zero or any extension.
7806///
7807/// Given a specific number of elements, element bit width, and extension
7808/// stride, produce either a zero or any extension based on the available
7809/// features of the subtarget.
7810static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7811 SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7812 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7813 assert(Scale > 1 && "Need a scale to extend.");
7814 int EltBits = VT.getSizeInBits() / NumElements;
7815 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7816        "Only 8, 16, and 32 bit elements can be extended.");
7817 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7818
7819 // Found a valid zext mask! Try various lowering strategies based on the
7820 // input type and available ISA extensions.
7821 if (Subtarget->hasSSE41()) {
7822 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7823 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7824 NumElements / Scale);
7825 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7826 return DAG.getNode(ISD::BITCAST, DL, VT,
7827 DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7828 }
7829
7830 // For any-extends we can cheat for larger element sizes and use shuffle
7831 // instructions that can fold with a load and/or copy.
7832 if (AnyExt && EltBits == 32) {
7833 int PSHUFDMask[4] = {0, -1, 1, -1};
7834 return DAG.getNode(
7835 ISD::BITCAST, DL, VT,
7836 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7837 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7838 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7839 }
7840 if (AnyExt && EltBits == 16 && Scale > 2) {
7841 int PSHUFDMask[4] = {0, -1, 0, -1};
7842 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7843 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7844 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7845 int PSHUFHWMask[4] = {1, -1, -1, -1};
7846 return DAG.getNode(
7847 ISD::BITCAST, DL, VT,
7848 DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7849 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7850 getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7851 }
7852
7853 // If this would require more than 2 unpack instructions to expand, use
7854 // pshufb when available. We can only use more than 2 unpack instructions
7855 // when zero extending i8 elements which also makes it easier to use pshufb.
7856 if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7857 assert(NumElements == 16 && "Unexpected byte vector width!");
7858 SDValue PSHUFBMask[16];
7859 for (int i = 0; i < 16; ++i)
7860 PSHUFBMask[i] =
7861 DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7862 InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7863 return DAG.getNode(ISD::BITCAST, DL, VT,
7864 DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7865 DAG.getNode(ISD::BUILD_VECTOR, DL,
7866 MVT::v16i8, PSHUFBMask)));
7867 }
7868
7869 // Otherwise emit a sequence of unpacks.
7870 do {
7871 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7872 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7873 : getZeroVector(InputVT, Subtarget, DAG, DL);
7874 InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7875 InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7876 Scale /= 2;
7877 EltBits *= 2;
7878 NumElements /= 2;
7879 } while (Scale > 1);
7880 return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7881}
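
A worked trace of the unpack fallback above (numbers only; the DAG construction is unchanged, and the helper name numUnpackSteps is an assumption for illustration): each pass through the loop halves Scale, doubles EltBits, and halves NumElements, so the fallback emits log2(Scale) UNPCKL nodes before the final bitcast.

  // e.g. extending i8 lanes to i32 lanes (Scale = 4) in a v16i8 takes two
  // unpack steps: (8-bit, 16 lanes) -> (16-bit, 8 lanes) -> (32-bit, 4 lanes).
  static int numUnpackSteps(int Scale) {
    int Steps = 0;
    while (Scale > 1) {
      Scale /= 2;
      ++Steps;
    }
    return Steps;
  }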
7882
7883/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7884///
7885/// This routine will try to do everything in its power to cleverly lower
7886/// a shuffle which happens to match the pattern of a zero extend. It doesn't
7887/// check for the profitability of this lowering; it tries to aggressively
7888/// match this pattern. It will use all of the micro-architectural details it
7889/// can to emit an efficient lowering. It handles both blends with all-zero
7890/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7891/// masking out later).
7892///
7893/// The reason we have dedicated lowering for zext-style shuffles is that they
7894/// are both incredibly common and often quite performance sensitive.
7895static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7896 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7897 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7898 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7899
7900 int Bits = VT.getSizeInBits();
7901 int NumElements = Mask.size();
7902
7903 // Define a helper function to check a particular ext-scale and lower to it if
7904 // valid.
7905 auto Lower = [&](int Scale) -> SDValue {
7906 SDValue InputV;
7907 bool AnyExt = true;
7908 for (int i = 0; i < NumElements; ++i) {
7909 if (Mask[i] == -1)
7910 continue; // Valid anywhere but doesn't tell us anything.
7911 if (i % Scale != 0) {
7912 // Each of the extended elements needs to be zeroable.
7913 if (!Zeroable[i])
7914 return SDValue();
7915
7916 // We no longer are in the anyext case.
7917 AnyExt = false;
7918 continue;
7919 }
7920
7921 // Each of the base elements needs to be consecutive indices into the
7922 // same input vector.
7923 SDValue V = Mask[i] < NumElements ? V1 : V2;
7924 if (!InputV)
7925 InputV = V;
7926 else if (InputV != V)
7927 return SDValue(); // Flip-flopping inputs.
7928
7929 if (Mask[i] % NumElements != i / Scale)
7930 return SDValue(); // Non-consecutive strided elements.
7931 }
7932
7933 // If we fail to find an input, we have a zero-shuffle which should always
7934 // have already been handled.
7935 // FIXME: Maybe handle this here in case during blending we end up with one?
7936 if (!InputV)
7937 return SDValue();
7938
7939 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7940 DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7941 };
7942
7943 // The widest scale possible for extending is to a 64-bit integer.
7944 assert(Bits % 64 == 0 &&
7945        "The number of bits in a vector must be divisible by 64 on x86!");
7946 int NumExtElements = Bits / 64;
7947
7948 // Each iteration, try extending the elements half as much, but into twice as
7949 // many elements.
7950 for (; NumExtElements < NumElements; NumExtElements *= 2) {
7951 assert(NumElements % NumExtElements == 0 &&
7952        "The input vector size must be divisible by the extended size.");
7953 if (SDValue V = Lower(NumElements / NumExtElements))
7954 return V;
7955 }
7956
7957 // No viable ext lowering found.
7958 return SDValue();
7959}
7960
7961/// \brief Try to get a scalar value for a specific element of a vector.
7962///
7963/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7964static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7965 SelectionDAG &DAG) {
7966 MVT VT = V.getSimpleValueType();
7967 MVT EltVT = VT.getVectorElementType();
7968 while (V.getOpcode() == ISD::BITCAST)
7969 V = V.getOperand(0);
7970 // If the bitcasts shift the element size, we can't extract an equivalent
7971 // element from it.
7972 MVT NewVT = V.getSimpleValueType();
7973 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7974 return SDValue();
7975
7976 if (V.getOpcode() == ISD::BUILD_VECTOR ||
7977 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
7978 return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
7979
7980 return SDValue();
7981}
7982
7983/// \brief Helper to test for a load that can be folded with x86 shuffles.
7984///
7985/// This is particularly important because the set of instructions varies
7986/// significantly based on whether the operand is a load or not.
7987static bool isShuffleFoldableLoad(SDValue V) {
7988 while (V.getOpcode() == ISD::BITCAST)
7989 V = V.getOperand(0);
7990
7991 return ISD::isNON_EXTLoad(V.getNode());
7992}
7993
7994/// \brief Try to lower insertion of a single element into a zero vector.
7995///
7996/// This is a common pattern that we have especially efficient ways to lower
7997/// across all subtarget feature sets.
7998static SDValue lowerVectorShuffleAsElementInsertion(
7999 MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8000 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8001 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8002 MVT ExtVT = VT;
8003 MVT EltVT = VT.getVectorElementType();
8004
8005 int V2Index = std::find_if(Mask.begin(), Mask.end(),
8006 [&Mask](int M) { return M >= (int)Mask.size(); }) -
8007 Mask.begin();
8008 bool IsV1Zeroable = true;
8009 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8010 if (i != V2Index && !Zeroable[i]) {
8011 IsV1Zeroable = false;
8012 break;
8013 }
8014
8015 // Check for a single input from a SCALAR_TO_VECTOR node.
8016 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8017 // all the smarts here sunk into that routine. However, the current
8018 // lowering of BUILD_VECTOR makes that nearly impossible until the old
8019 // vector shuffle lowering is dead.
8020 if (SDValue V2S = getScalarValueForVectorElement(
8021 V2, Mask[V2Index] - Mask.size(), DAG)) {
8022 // We need to zext the scalar if it is smaller than an i32.
8023 V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8024 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8025 // Using zext to expand a narrow element won't work for non-zero
8026 // insertions.
8027 if (!IsV1Zeroable)
8028 return SDValue();
8029
8030 // Zero-extend directly to i32.
8031 ExtVT = MVT::v4i32;
8032 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8033 }
8034 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8035 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8036 EltVT == MVT::i16) {
8037 // Either not inserting from the low element of the input or the input
8038 // element size is too small to use VZEXT_MOVL to clear the high bits.
8039 return SDValue();
8040 }
8041
8042 if (!IsV1Zeroable) {
8043 // If V1 can't be treated as a zero vector we have fewer options to lower
8044 // this. We can't support integer vectors or non-zero targets cheaply, and
8045 // the V1 elements can't be permuted in any way.
8046 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8047 if (!VT.isFloatingPoint() || V2Index != 0)
8048 return SDValue();
8049 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8050 V1Mask[V2Index] = -1;
8051 if (!isNoopShuffleMask(V1Mask))
8052 return SDValue();
8053 // This is essentially a special case blend operation, but if we have
8054 // general purpose blend operations, they are always faster. Bail and let
8055 // the rest of the lowering handle these as blends.
8056 if (Subtarget->hasSSE41())
8057 return SDValue();
8058
8059 // Otherwise, use MOVSD or MOVSS.
8060 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8061        "Only two types of floating point element types to handle!");
8062 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8063 ExtVT, V1, V2);
8064 }
8065
8066 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8067 if (ExtVT != VT)
8068 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8069
8070 if (V2Index != 0) {
8071 // If we have 4 or fewer lanes we can cheaply shuffle the element into
8072 // the desired position. Otherwise it is more efficient to do a vector
8073 // shift left. We know that we can do a vector shift left because all
8074 // the inputs are zero.
8075 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8076 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8077 V2Shuffle[V2Index] = 0;
8078 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8079 } else {
8080 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8081 V2 = DAG.getNode(
8082 X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8083 DAG.getConstant(
8084 V2Index * EltVT.getSizeInBits(),
8085 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8086 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8087 }
8088 }
8089 return V2;
8090}
8091
8092/// \brief Try to lower broadcast of a single element.
8093///
8094/// For convenience, this code also bundles all of the subtarget feature set
8095/// filtering. While a little annoying to re-dispatch on type here, there isn't
8096/// a convenient way to factor it out.
8097static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8098 ArrayRef<int> Mask,
8099 const X86Subtarget *Subtarget,
8100 SelectionDAG &DAG) {
8101 if (!Subtarget->hasAVX())
8102 return SDValue();
8103 if (VT.isInteger() && !Subtarget->hasAVX2())
8104 return SDValue();
8105
8106 // Check that the mask is a broadcast.
8107 int BroadcastIdx = -1;
8108 for (int M : Mask)
8109 if (M >= 0 && BroadcastIdx == -1)
8110 BroadcastIdx = M;
8111 else if (M >= 0 && M != BroadcastIdx)
8112 return SDValue();
8113
8114 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8115                                            "a sorted mask where the broadcast "
8116                                            "comes from V1.");
8117
8118 // Go up the chain of (vector) values to try and find a scalar load that
8119 // we can combine with the broadcast.
8120 for (;;) {
8121 switch (V.getOpcode()) {
8122 case ISD::CONCAT_VECTORS: {
8123 int OperandSize = Mask.size() / V.getNumOperands();
8124 V = V.getOperand(BroadcastIdx / OperandSize);
8125 BroadcastIdx %= OperandSize;
8126 continue;
8127 }
8128
8129 case ISD::INSERT_SUBVECTOR: {
8130 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8131 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8132 if (!ConstantIdx)
8133 break;
8134
8135 int BeginIdx = (int)ConstantIdx->getZExtValue();
8136 int EndIdx =
8137 BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8138 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8139 BroadcastIdx -= BeginIdx;
8140 V = VInner;
8141 } else {
8142 V = VOuter;
8143 }
8144 continue;
8145 }
8146 }
8147 break;
8148 }
8149
8150 // Check if this is a broadcast of a scalar. We special case lowering
8151 // for scalars so that we can more effectively fold with loads.
8152 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8153 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8154 V = V.getOperand(BroadcastIdx);
8155
8156 // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8157 // AVX2.
8158 if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8159 return SDValue();
8160 } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8161 // We can't broadcast from a vector register w/o AVX2, and we can only
8162 // broadcast from the zero-element of a vector register.
8163 return SDValue();
8164 }
8165
8166 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8167}
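
A short worked example of the index walk above, with names that are illustrative only: for a CONCAT_VECTORS node, the operand to descend into is BroadcastIdx / OperandSize and the index within it is BroadcastIdx % OperandSize; the INSERT_SUBVECTOR case similarly rebases the index when it falls inside the inserted subvector and otherwise keeps walking the outer vector.

  // Mirrors the CONCAT_VECTORS step above.
  struct ConcatStep { int Operand; int NewBroadcastIdx; };

  static ConcatStep stepThroughConcat(int BroadcastIdx, int MaskSize,
                                      int NumOperands) {
    int OperandSize = MaskSize / NumOperands;
    return { BroadcastIdx / OperandSize, BroadcastIdx % OperandSize };
  }

  // Broadcasting element 6 of a v8i32 built as concat_vectors(A, B) descends
  // into operand 1 (B) with the broadcast index rebased to 2.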
8168
8169// Check for whether we can use INSERTPS to perform the shuffle. We only use
8170// INSERTPS when the V1 elements are already in the correct locations
8171// because otherwise we can just always use two SHUFPS instructions which
8172// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8173// perform INSERTPS if a single V1 element is out of place and all V2
8174// elements are zeroable.
8175static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8176 ArrayRef<int> Mask,
8177 SelectionDAG &DAG) {
8178 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8179 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8180 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8181 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8182
8183 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8184
8185 unsigned ZMask = 0;
8186 int V1DstIndex = -1;
8187 int V2DstIndex = -1;
8188 bool V1UsedInPlace = false;
8189
8190 for (int i = 0; i < 4; i++) {
8191 // Synthesize a zero mask from the zeroable elements (includes undefs).
8192 if (Zeroable[i]) {
8193 ZMask |= 1 << i;
8194 continue;
8195 }
8196
8197 // Flag if we use any V1 inputs in place.
8198 if (i == Mask[i]) {
8199 V1UsedInPlace = true;
8200 continue;
8201 }
8202
8203 // We can only insert a single non-zeroable element.
8204 if (V1DstIndex != -1 || V2DstIndex != -1)
8205 return SDValue();
8206
8207 if (Mask[i] < 4) {
8208 // V1 input out of place for insertion.
8209 V1DstIndex = i;
8210 } else {
8211 // V2 input for insertion.
8212 V2DstIndex = i;
8213 }
8214 }
8215
8216 // Don't bother if we have no (non-zeroable) element for insertion.
8217 if (V1DstIndex == -1 && V2DstIndex == -1)
8218 return SDValue();
8219
8220 // Determine element insertion src/dst indices. The src index is from the
8221 // start of the inserted vector, not the start of the concatenated vector.
8222 unsigned V2SrcIndex = 0;
8223 if (V1DstIndex != -1) {
8224 // If we have a V1 input out of place, we use V1 as the V2 element insertion
8225 // and don't use the original V2 at all.
8226 V2SrcIndex = Mask[V1DstIndex];
8227 V2DstIndex = V1DstIndex;
8228 V2 = V1;
8229 } else {
8230 V2SrcIndex = Mask[V2DstIndex] - 4;
8231 }
8232
8233 // If no V1 inputs are used in place, then the result is created only from
8234 // the zero mask and the V2 insertion - so remove V1 dependency.
8235 if (!V1UsedInPlace)
8236 V1 = DAG.getUNDEF(MVT::v4f32);
8237
8238 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8239 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8240
8241 // Insert the V2 element into the desired position.
8242 SDLoc DL(Op);
8243 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8244 DAG.getConstant(InsertPSMask, MVT::i8));
8245}
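
A minimal sketch of the INSERTPS immediate layout used above; the helper name encodeInsertPSImm is an assumption for illustration. Bits [7:6] select the source element of the inserted operand, bits [5:4] the destination lane, and bits [3:0] the lanes forced to zero.

  static unsigned encodeInsertPSImm(unsigned V2SrcIndex, unsigned V2DstIndex,
                                    unsigned ZMask) {
    return (V2SrcIndex << 6) | (V2DstIndex << 4) | (ZMask & 0xF);
  }

  // For a v4f32 mask [0, 5, 2, 3] with nothing zeroable, the loop above finds
  // V1 lanes 0, 2 and 3 in place, V2DstIndex = 1, V2SrcIndex = 5 - 4 = 1 and
  // ZMask = 0, so it emits INSERTPS with encodeInsertPSImm(1, 1, 0) = 0x50.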
8246
8247/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8248///
8249/// This is the basis function for the 2-lane 64-bit shuffles as we have full
8250/// support for floating point shuffles but not integer shuffles. These
8251/// instructions will incur a domain crossing penalty on some chips though so
8252/// it is better to avoid lowering through this for integer vectors where
8253/// possible.
8254static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8255 const X86Subtarget *Subtarget,
8256 SelectionDAG &DAG) {
8257 SDLoc DL(Op);
8258 assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8259 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8260 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8261 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8262 ArrayRef<int> Mask = SVOp->getMask();
8263 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8264
8265 if (isSingleInputShuffleMask(Mask)) {
8266 // Use low duplicate instructions for masks that match their pattern.
8267 if (Subtarget->hasSSE3())
8268 if (isShuffleEquivalent(Mask, 0, 0))
8269 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8270
8271 // Straight shuffle of a single input vector. Simulate this by using the
8272 // single input as both of the "inputs" to this instruction.
8273 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8274
8275 if (Subtarget->hasAVX()) {
8276 // If we have AVX, we can use VPERMILPD which will allow folding a load
8277 // into the shuffle.
8278 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8279 DAG.getConstant(SHUFPDMask, MVT::i8));
8280 }
8281
8282 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8283 DAG.getConstant(SHUFPDMask, MVT::i8));
8284 }
8285 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8286 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8287
8288 // Use dedicated unpack instructions for masks that match their pattern.
8289 if (isShuffleEquivalent(Mask, 0, 2))
8290 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8291 if (isShuffleEquivalent(Mask, 1, 3))
8292 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8293
8294 // If we have a single input, insert that into V1 if we can do so cheaply.
8295 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8296 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8297 MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8298 return Insertion;
8299 // Try inverting the insertion since for v2 masks it is easy to do and we
8300 // can't reliably sort the mask one way or the other.
8301 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8302 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8303 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8304 MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8305 return Insertion;
8306 }
8307
8308 // Try to use one of the special instruction patterns to handle two common
8309 // blend patterns if a zero-blend above didn't work.
8310 if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8311 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8312 // We can either use a special instruction to load over the low double or
8313 // to move just the low double.
8314 return DAG.getNode(
8315 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8316 DL, MVT::v2f64, V2,
8317 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8318
8319 if (Subtarget->hasSSE41())
8320 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8321 Subtarget, DAG))
8322 return Blend;
8323
8324 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8325 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8326 DAG.getConstant(SHUFPDMask, MVT::i8));
8327}
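
A short worked example of the SHUFPD fallback above; the helper name is illustrative only. The immediate's bit 0 picks the element taken from the first operand and bit 1 the element taken from the second.

  // Mirrors the immediate computation above for a canonicalized v2f64 blend
  // mask (Mask[0] in [0,1], Mask[1] in [2,3]).
  static unsigned shufpdImmForBlend(int M0, int M1) {
    return (unsigned)(M0 == 1) | ((unsigned)((M1 - 2) == 1) << 1);
  }

  // Mask [0, 3] gives immediate 2: SHUFPD takes element 0 of V1 and element 1
  // of V2. Mask [1, 2] gives immediate 1: element 1 of V1 and element 0 of V2.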
8328
8329/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8330///
8331/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8332/// the integer unit to minimize domain crossing penalties. However, for blends
8333/// it falls back to the floating point shuffle operation with appropriate bit
8334/// casting.
8335static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8336 const X86Subtarget *Subtarget,
8337 SelectionDAG &DAG) {
8338 SDLoc DL(Op);
8339 assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8340 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8341 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8342 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8343 ArrayRef<int> Mask = SVOp->getMask();
8344 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8345
8346 if (isSingleInputShuffleMask(Mask)) {
8347 // Check for being able to broadcast a single element.
8348 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8349 Mask, Subtarget, DAG))
8350 return Broadcast;
8351
8352 // Straight shuffle of a single input vector. For everything from SSE2
8353 // onward this has a single fast instruction with no scary immediates.
8354 // We have to map the mask as it is actually a v4i32 shuffle instruction.
8355 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8356 int WidenedMask[4] = {
8357 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8358 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8359 return DAG.getNode(
8360 ISD::BITCAST, DL, MVT::v2i64,
8361 DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8362 getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8363 }
8364
8365 // Try to use byte shift instructions.
8366 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8367 DL, MVT::v2i64, V1, V2, Mask, DAG))
8368 return Shift;
8369
8370 // If we have a single input from V2 insert that into V1 if we can do so
8371 // cheaply.
8372 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8373 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8374 MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8375 return Insertion;
8376 // Try inverting the insertion since for v2 masks it is easy to do and we
8377 // can't reliably sort the mask one way or the other.
8378 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8379 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8380 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8381 MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8382 return Insertion;
8383 }
8384
8385 // Use dedicated unpack instructions for masks that match their pattern.
8386 if (isShuffleEquivalent(Mask, 0, 2))
8387 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8388 if (isShuffleEquivalent(Mask, 1, 3))
8389 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8390
8391 if (Subtarget->hasSSE41())
8392 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8393 Subtarget, DAG))
8394 return Blend;
8395
8396 // Try to use byte rotation instructions.
8397 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8398 if (Subtarget->hasSSSE3())
8399 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8400 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8401 return Rotate;
8402
8403 // We implement this with SHUFPD which is pretty lame because it will likely
8404 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8405 // However, all the alternatives are still more cycles and newer chips don't
8406 // have this problem. It would be really nice if x86 had better shuffles here.
8407 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8408 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8409 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8410 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8411}
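
A short worked example of the mask widening in the single-input v2i64 path above: each 64-bit lane index m becomes the pair of 32-bit indices 2m and 2m+1 (with undef clamped to lane 0), so the shuffle can be issued as a v4i32 PSHUFD. The helper name is an assumption for illustration.

  #include <algorithm>

  // Mirrors the WidenedMask computation above.
  static void widenV2I64Mask(const int Mask[2], int Widened[4]) {
    Widened[0] = std::max(Mask[0], 0) * 2;
    Widened[1] = std::max(Mask[0], 0) * 2 + 1;
    Widened[2] = std::max(Mask[1], 0) * 2;
    Widened[3] = std::max(Mask[1], 0) * 2 + 1;
  }

  // A v2i64 mask [1, 0] widens to the v4i32 mask [2, 3, 0, 1], and an
  // undef-heavy mask [-1, 1] widens to [0, 1, 2, 3].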
8412
8413/// \brief Lower a vector shuffle using the SHUFPS instruction.
8414///
8415/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8416/// It makes no assumptions about whether this is the *best* lowering; it simply
8417/// uses it.
8418static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8419 ArrayRef<int> Mask, SDValue V1,
8420 SDValue V2, SelectionDAG &DAG) {
8421 SDValue LowV = V1, HighV = V2;
8422 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8423
8424 int NumV2Elements =
8425 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8426
8427 if (NumV2Elements == 1) {
8428 int V2Index =
8429 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8430 Mask.begin();
8431
8432 // Compute the index adjacent to V2Index and in the same half by toggling
8433 // the low bit.
8434 int V2AdjIndex = V2Index ^ 1;
8435
8436 if (Mask[V2AdjIndex] == -1) {
8437 // Handles all the cases where we have a single V2 element and an undef.
8438 // This will only ever happen in the high lanes because we commute the
8439 // vector otherwise.
8440 if (V2Index < 2)
8441 std::swap(LowV, HighV);
8442 NewMask[V2Index] -= 4;
8443 } else {
8444 // Handle the case where the V2 element ends up adjacent to a V1 element.
8445 // To make this work, blend them together as the first step.
8446 int V1Index = V2AdjIndex;
8447 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8448 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8449 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8450
8451 // Now proceed to reconstruct the final blend as we have the necessary
8452 // high or low half formed.
8453 if (V2Index < 2) {
8454 LowV = V2;
8455 HighV = V1;
8456 } else {
8457 HighV = V2;
8458 }
8459 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8460 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8461 }
8462 } else if (NumV2Elements == 2) {
8463 if (Mask[0] < 4 && Mask[1] < 4) {
8464 // Handle the easy case where we have V1 in the low lanes and V2 in the
8465 // high lanes.
8466 NewMask[2] -= 4;
8467 NewMask[3] -= 4;
8468 } else if (Mask[2] < 4 && Mask[3] < 4) {
8469 // We also handle the reversed case because this utility may get called
8470 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8471 // arrange things in the right direction.
8472 NewMask[0] -= 4;
8473 NewMask[1] -= 4;
8474 HighV = V1;
8475 LowV = V2;
8476 } else {
8477 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8478 // trying to place elements directly, just blend them and set up the final
8479 // shuffle to place them.
8480
8481 // The first two blend mask elements are for V1, the second two are for
8482 // V2.
8483 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8484 Mask[2] < 4 ? Mask[2] : Mask[3],
8485 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8486 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8487 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8488 getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8489
8490 // Now we do a normal shuffle of V1 by giving V1 as both operands to
8491 // a blend.
8492 LowV = HighV = V1;
8493 NewMask[0] = Mask[0] < 4 ? 0 : 2;
8494 NewMask[1] = Mask[0] < 4 ? 2 : 0;
8495 NewMask[2] = Mask[2] < 4 ? 1 : 3;
8496 NewMask[3] = Mask[2] < 4 ? 3 : 1;
8497 }
8498 }
8499 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8500 getV4X86ShuffleImm8ForMask(NewMask, DAG));
8501}
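
As background for why the helper above only ever has to fix up a four-element NewMask: SHUFPS writes its low two result lanes from the first operand and its high two lanes from the second, each selected by a 2-bit immediate field, so the code only needs to decide which input becomes LowV/HighV and which in-operand index each field holds. A minimal sketch of that selection semantics, illustrative only and using plain floats instead of DAG nodes:

  // Result lanes 0-1 come from A, lanes 2-3 from B; each 2-bit field of Imm
  // selects an element within the corresponding operand.
  static void shufps(const float A[4], const float B[4], unsigned Imm,
                     float Out[4]) {
    Out[0] = A[(Imm >> 0) & 3];
    Out[1] = A[(Imm >> 2) & 3];
    Out[2] = B[(Imm >> 4) & 3];
    Out[3] = B[(Imm >> 6) & 3];
  }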
8502
8503/// \brief Lower 4-lane 32-bit floating point shuffles.
8504///
8505/// Uses instructions exclusively from the floating point unit to minimize
8506/// domain crossing penalties, as these are sufficient to implement all v4f32
8507/// shuffles.
8508static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8509 const X86Subtarget *Subtarget,
8510 SelectionDAG &DAG) {
8511 SDLoc DL(Op);
8512 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8513 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8514 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8515 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8516 ArrayRef<int> Mask = SVOp->getMask();
8517 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8518
8519 int NumV2Elements =
8520 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8521
8522 if (NumV2Elements == 0) {
8523 // Check for being able to broadcast a single element.
8524 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8525 Mask, Subtarget, DAG))
8526 return Broadcast;
8527
8528 // Use even/odd duplicate instructions for masks that match their pattern.
8529 if (Subtarget->hasSSE3()) {
8530 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8531 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8532 if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8533 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8534 }
8535
8536 if (Subtarget->hasAVX()) {
8537 // If we have AVX, we can use VPERMILPS which will allow folding a load
8538 // into the shuffle.
8539 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8540 getV4X86ShuffleImm8ForMask(Mask, DAG));
8541 }
8542
8543 // Otherwise, use a straight shuffle of a single input vector. We pass the
8544 // input vector to both operands to simulate this with a SHUFPS.
8545 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8546 getV4X86ShuffleImm8ForMask(Mask, DAG));
8547 }
8548
8549 // Use dedicated unpack instructions for masks that match their pattern.
8550 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8551 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8552 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8553 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8554
8555 // There are special ways we can lower some single-element blends. However, we
8556 // have custom ways we can lower more complex single-element blends below that
8557 // we defer to if both this and BLENDPS fail to match, so restrict this to
8558 // when the V2 input is targeting element 0 of the mask -- that is the fast
8559 // case here.
8560 if (NumV2Elements == 1 && Mask[0] >= 4)
8561 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8562 Mask, Subtarget, DAG))
8563 return V;
8564
8565 if (Subtarget->hasSSE41()) {
8566 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8567 Subtarget, DAG))
8568 return Blend;
8569
8570 // Use INSERTPS if we can complete the shuffle efficiently.
8571 if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8572 return V;
8573 }
8574
8575 // Otherwise fall back to a SHUFPS lowering strategy.
8576 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8577}
8578
8579/// \brief Lower 4-lane i32 vector shuffles.
8580///
8581/// We try to handle these with integer-domain shuffles where we can, but for
8582/// blends we use the floating point domain blend instructions.
8583static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8584 const X86Subtarget *Subtarget,
8585 SelectionDAG &DAG) {
8586 SDLoc DL(Op);
8587 assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8588 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8589 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8590 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8591 ArrayRef<int> Mask = SVOp->getMask();
8592 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8593
8594 // Whenever we can lower this as a zext, that instruction is strictly faster
8595 // than any alternative. It also allows us to fold memory operands into the
8596 // shuffle in many cases.
8597 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8598 Mask, Subtarget, DAG))
8599 return ZExt;
8600
8601 int NumV2Elements =
8602 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8603
8604 if (NumV2Elements == 0) {
8605 // Check for being able to broadcast a single element.
8606 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8607 Mask, Subtarget, DAG))
8608 return Broadcast;
8609
8610 // Straight shuffle of a single input vector. For everything from SSE2
8611 // onward this has a single fast instruction with no scary immediates.
8612 // We coerce the shuffle pattern to be compatible with UNPCK instructions
8613 // but we aren't actually going to use the UNPCK instruction because doing
8614 // so prevents folding a load into this instruction or making a copy.
8615 const int UnpackLoMask[] = {0, 0, 1, 1};
8616 const int UnpackHiMask[] = {2, 2, 3, 3};
8617 if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8618 Mask = UnpackLoMask;
8619 else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8620 Mask = UnpackHiMask;
8621
8622 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8623 getV4X86ShuffleImm8ForMask(Mask, DAG));
8624 }
8625
8626 // Try to use byte shift instructions.
8627 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8628 DL, MVT::v4i32, V1, V2, Mask, DAG))
8629 return Shift;
8630
8631 // There are special ways we can lower some single-element blends.
8632 if (NumV2Elements == 1)
8633 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8634 Mask, Subtarget, DAG))
8635 return V;
8636
8637 // Use dedicated unpack instructions for masks that match their pattern.
8638 if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8639 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8640 if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8641 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8642
8643 if (Subtarget->hasSSE41())
8644 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8645 Subtarget, DAG))
8646 return Blend;
8647
8648 // Try to use byte rotation instructions.
8649 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
8650 if (Subtarget->hasSSSE3())
8651 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8652 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8653 return Rotate;
8654
8655 // We implement this with SHUFPS because it can blend from two vectors.
8656 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8657 // up the inputs, bypassing domain shift penalties that we would incur if we
8658 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8659 // relevant.
8660 return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8661 DAG.getVectorShuffle(
8662 MVT::v4f32, DL,
8663 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8664 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8665}
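
Both PSHUFD emissions above feed the four-lane mask through getV4X86ShuffleImm8ForMask, which is declared earlier in the file but not shown in this excerpt. As a minimal sketch of the conventional PSHUFD/SHUFPS immediate encoding (two bits per destination lane, lane 0 in the low bits), a hypothetical packing helper could look like this; it is an illustration under that assumption, not the routine used above:

#include <cassert>
#include <cstdint>

static uint8_t packV4ShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // an undef lane may pick any source
    assert(M < 4 && "expected a single-input four-lane mask");
    Imm |= static_cast<uint8_t>(M) << (2 * i); // two bits per destination lane
  }
  return Imm; // e.g. the UnpackLoMask {0, 0, 1, 1} above packs to 0x50
}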
8666
8667/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8668/// shuffle lowering, and the most complex part.
8669///
8670/// The lowering strategy is to try to form pairs of input lanes which are
8671/// targeted at the same half of the final vector, and then use a dword shuffle
8672/// to place them onto the right half, and finally unpack the paired lanes into
8673/// their final position.
8674///
8675/// The exact breakdown of how to form these dword pairs and align them on the
8676/// correct sides is really tricky. See the comments within the function for
8677/// more of the details.
8678static SDValue lowerV8I16SingleInputVectorShuffle(
8679 SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8680 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8681 assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8682 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8683 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8684
8685 SmallVector<int, 4> LoInputs;
8686 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8687 [](int M) { return M >= 0; });
8688 std::sort(LoInputs.begin(), LoInputs.end());
8689 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8690 SmallVector<int, 4> HiInputs;
8691 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8692 [](int M) { return M >= 0; });
8693 std::sort(HiInputs.begin(), HiInputs.end());
8694 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8695 int NumLToL =
8696 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8697 int NumHToL = LoInputs.size() - NumLToL;
8698 int NumLToH =
8699 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8700 int NumHToH = HiInputs.size() - NumLToH;
8701 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8702 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8703 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8704 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8705
8706 // Check for being able to broadcast a single element.
8707 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8708 Mask, Subtarget, DAG))
8709 return Broadcast;
8710
8711 // Try to use byte shift instructions.
8712 if (SDValue Shift = lowerVectorShuffleAsByteShift(
8713 DL, MVT::v8i16, V, V, Mask, DAG))
8714 return Shift;
8715
8716 // Use dedicated unpack instructions for masks that match their pattern.
8717 if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8718 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8719 if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8720 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8721
8722 // Try to use byte rotation instructions.
8723 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8724 DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8725 return Rotate;
8726
8727 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8728 // such inputs we can swap two of the dwords across the half mark and end up
8729 // with <=2 inputs to each half in each half. Once there, we can fall through
8730 // to the generic code below. For example:
8731 //
8732 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8733 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8734 //
8735 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8736 // and an existing 2-into-2 on the other half. In this case we may have to
8737 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8738 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8739 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8740 // because any other situation (including a 3-into-1 or 1-into-3 in the other
8741 // half than the one we target for fixing) will be fixed when we re-enter this
8742 // path. We will also combine away any resulting sequence of PSHUFD
8743 // instructions into a single instruction. Here is an example of the tricky case:
8744 //
8745 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8746 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8747 //
8748 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8749 //
8750 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8751 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8752 //
8753 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8754 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8755 //
8756 // The result is fine to be handled by the generic logic.
8757 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8758 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8759 int AOffset, int BOffset) {
8760 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8761 "Must call this with A having 3 or 1 inputs from the A half.");
8762 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8763 "Must call this with B having 1 or 3 inputs from the B half.");
8764 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8765 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8766
8767 // Compute the index of the dword with only one word among the three inputs in
8768 // a half by taking the sum of the half with three inputs and subtracting
8769 // the sum of the actual three inputs. The difference is the remaining
8770 // slot.
8771 int ADWord, BDWord;
8772 int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8773 int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8774 int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8775 ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8776 int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8777 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8778 int TripleNonInputIdx =
8779 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8780 TripleDWord = TripleNonInputIdx / 2;
8781
8782 // We use xor with one to compute the adjacent DWord to whichever one the
8783 // OneInput is in.
8784 OneInputDWord = (OneInput / 2) ^ 1;
8785
8786 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8787 // and BToA inputs. If there is also such a problem with the BToB and AToB
8788 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8789 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8790 // is essential that we don't *create* a 3<-1 as then we might oscillate.
8791 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8792 // Compute how many inputs will be flipped by swapping these DWords. We
8793 // need
8794 // to balance this to ensure we don't form a 3-1 shuffle in the other
8795 // half.
8796 int NumFlippedAToBInputs =
8797 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8798 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8799 int NumFlippedBToBInputs =
8800 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8801 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8802 if ((NumFlippedAToBInputs == 1 &&
8803 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8804 (NumFlippedBToBInputs == 1 &&
8805 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8806 // We choose whether to fix the A half or B half based on whether that
8807 // half has zero flipped inputs. At zero, we may not be able to fix it
8808 // with that half. We also bias towards fixing the B half because that
8809 // will more commonly be the high half, and we have to bias one way.
8810 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8811 ArrayRef<int> Inputs) {
8812 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8813 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8814 PinnedIdx ^ 1) != Inputs.end();
8815 // Determine whether the free index is in the flipped dword or the
8816 // unflipped dword based on where the pinned index is. We use this bit
8817 // in an xor to conditionally select the adjacent dword.
8818 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8819 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8820 FixFreeIdx) != Inputs.end();
8821 if (IsFixIdxInput == IsFixFreeIdxInput)
8822 FixFreeIdx += 1;
8823 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8824 FixFreeIdx) != Inputs.end();
8825 assert(IsFixIdxInput != IsFixFreeIdxInput &&
8826 "We need to be changing the number of flipped inputs!");
8827 int PSHUFHalfMask[] = {0, 1, 2, 3};
8828 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8829 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8830 MVT::v8i16, V,
8831 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8832
8833 for (int &M : Mask)
8834 if (M != -1 && M == FixIdx)
8835 M = FixFreeIdx;
8836 else if (M != -1 && M == FixFreeIdx)
8837 M = FixIdx;
8838 };
8839 if (NumFlippedBToBInputs != 0) {
8840 int BPinnedIdx =
8841 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8842 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8843 } else {
8844 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8845 int APinnedIdx =
8846 AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8847 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8848 }
8849 }
8850 }
8851
8852 int PSHUFDMask[] = {0, 1, 2, 3};
8853 PSHUFDMask[ADWord] = BDWord;
8854 PSHUFDMask[BDWord] = ADWord;
8855 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8856 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8857 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8858 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8859
8860 // Adjust the mask to match the new locations of A and B.
8861 for (int &M : Mask)
8862 if (M != -1 && M/2 == ADWord)
8863 M = 2 * BDWord + M % 2;
8864 else if (M != -1 && M/2 == BDWord)
8865 M = 2 * ADWord + M % 2;
8866
8867 // Recurse back into this routine to re-compute state now that this isn't
8868 // a 3 and 1 problem.
8869 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8870 Mask);
8871 };
8872 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8873 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8874 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8875 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8876
8877 // At this point there are at most two inputs to the low and high halves from
8878 // each half. That means the inputs can always be grouped into dwords and
8879 // those dwords can then be moved to the correct half with a dword shuffle.
8880 // We use at most one low and one high word shuffle to collect these paired
8881 // inputs into dwords, and finally a dword shuffle to place them.
8882 int PSHUFLMask[4] = {-1, -1, -1, -1};
8883 int PSHUFHMask[4] = {-1, -1, -1, -1};
8884 int PSHUFDMask[4] = {-1, -1, -1, -1};
8885
8886 // First fix the masks for all the inputs that are staying in their
8887 // original halves. This will then dictate the targets of the cross-half
8888 // shuffles.
8889 auto fixInPlaceInputs =
8890 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8891 MutableArrayRef<int> SourceHalfMask,
8892 MutableArrayRef<int> HalfMask, int HalfOffset) {
8893 if (InPlaceInputs.empty())
8894 return;
8895 if (InPlaceInputs.size() == 1) {
8896 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8897 InPlaceInputs[0] - HalfOffset;
8898 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8899 return;
8900 }
8901 if (IncomingInputs.empty()) {
8902 // Just fix all of the in place inputs.
8903 for (int Input : InPlaceInputs) {
8904 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8905 PSHUFDMask[Input / 2] = Input / 2;
8906 }
8907 return;
8908 }
8909
8910 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8911 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8912 InPlaceInputs[0] - HalfOffset;
8913 // Put the second input next to the first so that they are packed into
8914 // a dword. We find the adjacent index by toggling the low bit.
8915 int AdjIndex = InPlaceInputs[0] ^ 1;
8916 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8917 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8918 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8919 };
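// Worked example for the fix-up above (hypothetical numbers, not from the
// original file): with HalfOffset == 0, InPlaceInputs == {1, 2} and a
// non-empty IncomingInputs, the code pins SourceHalfMask[1] = 1, picks
// AdjIndex = 1 ^ 1 = 0, packs the second input next to the first via
// SourceHalfMask[0] = 2, rewrites every use of 2 in HalfMask to 0, and
// marks PSHUFDMask[0] = 0 so the packed dword stays in place.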
8920 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8921 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8922
8923 // Now gather the cross-half inputs and place them into a free dword of
8924 // their target half.
8925 // FIXME: This operation could almost certainly be simplified dramatically to
8926 // look more like the 3-1 fixing operation.
8927 auto moveInputsToRightHalf = [&PSHUFDMask](
8928 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8929 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8930 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8931 int DestOffset) {
8932 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8933 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8934 };
8935 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8936 int Word) {
8937 int LowWord = Word & ~1;
8938 int HighWord = Word | 1;
8939 return isWordClobbered(SourceHalfMask, LowWord) ||
8940 isWordClobbered(SourceHalfMask, HighWord);
8941 };
8942
8943 if (IncomingInputs.empty())
8944 return;
8945
8946 if (ExistingInputs.empty()) {
8947 // Map any dwords with inputs from them into the right half.
8948 for (int Input : IncomingInputs) {
8949 // If the source half mask maps over the inputs, turn those into
8950 // swaps and use the swapped lane.
8951 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8952 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8953 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8954 Input - SourceOffset;
8955 // We have to swap the uses in our half mask in one sweep.
8956 for (int &M : HalfMask)
8957 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8958 M = Input;
8959 else if (M == Input)
8960 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8961 } else {
8962 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8963 Input - SourceOffset &&
8964 "Previous placement doesn't match!");
8965 }
8966 // Note that this correctly re-maps both when we do a swap and when
8967 // we observe the other side of the swap above. We rely on that to
8968 // avoid swapping the members of the input list directly.
8969 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8970 }
8971
8972 // Map the input's dword into the correct half.
8973 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8974 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8975 else
8976 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8977 Input / 2 &&
8978 "Previous placement doesn't match!");
8979 }
8980
8981 // And just directly shift any other-half mask elements to be same-half
8982 // as we will have mirrored the dword containing the element into the
8983 // same position within that half.
8984 for (int &M : HalfMask)
8985 if (M >= SourceOffset && M < SourceOffset + 4) {
8986 M = M - SourceOffset + DestOffset;
8987 assert(M >= 0 && "This should never wrap below zero!");
8988 }
8989 return;
8990 }
8991
8992 // Ensure we have the input in a viable dword of its current half. This
8993 // is particularly tricky because the original position may be clobbered
8994 // by inputs being moved and *staying* in that half.
8995 if (IncomingInputs.size() == 1) {
8996 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8997 int InputFixed = std::find(std::begin(SourceHalfMask),
8998 std::end(SourceHalfMask), -1) -
8999 std::begin(SourceHalfMask) + SourceOffset;
9000 SourceHalfMask[InputFixed - SourceOffset] =
9001 IncomingInputs[0] - SourceOffset;
9002 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9003 InputFixed);
9004 IncomingInputs[0] = InputFixed;
9005 }
9006 } else if (IncomingInputs.size() == 2) {
9007 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9008 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9009 // We have two non-adjacent or clobbered inputs we need to extract from
9010 // the source half. To do this, we need to map them into some adjacent
9011 // dword slot in the source mask.
9012 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9013 IncomingInputs[1] - SourceOffset};
9014
9015 // If there is a free slot in the source half mask adjacent to one of
9016 // the inputs, place the other input in it. We use (Index XOR 1) to
9017 // compute an adjacent index.
9018 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9019 SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9020 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9021 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9022 InputsFixed[1] = InputsFixed[0] ^ 1;
9023 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9024 SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9025 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9026 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9027 InputsFixed[0] = InputsFixed[1] ^ 1;
9028 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9029 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9030 // The two inputs are in the same DWord but it is clobbered and the
9031 // adjacent DWord isn't used at all. Move both inputs to the free
9032 // slot.
9033 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9034 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9035 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9036 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9037 } else {
9038 // The only way we hit this point is if there is no clobbering
9039 // (because there are no off-half inputs to this half) and there is no
9040 // free slot adjacent to one of the inputs. In this case, we have to
9041 // swap an input with a non-input.
9042 for (int i = 0; i < 4; ++i)
9043 assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9044 "We can't handle any clobbers here!");
9045 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9046 "Cannot have adjacent inputs here!");
9047
9048 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9049 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9050
9051 // We also have to update the final source mask in this case because
9052 // it may need to undo the above swap.
9053 for (int &M : FinalSourceHalfMask)
9054 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9055 M = InputsFixed[1] + SourceOffset;
9056 else if (M == InputsFixed[1] + SourceOffset)
9057 M = (InputsFixed[0] ^ 1) + SourceOffset;
9058
9059 InputsFixed[1] = InputsFixed[0] ^ 1;
9060 }
9061
9062 // Point everything at the fixed inputs.
9063 for (int &M : HalfMask)
9064 if (M == IncomingInputs[0])
9065 M = InputsFixed[0] + SourceOffset;
9066 else if (M == IncomingInputs[1])
9067 M = InputsFixed[1] + SourceOffset;
9068
9069 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9070 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9071 }
9072 } else {
9073 llvm_unreachable("Unhandled input size!");
9074 }
9075
9076 // Now hoist the DWord down to the right half.
9077 int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9078 assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9079 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9080 for (int &M : HalfMask)
9081 for (int Input : IncomingInputs)
9082 if (M == Input)
9083 M = FreeDWord * 2 + Input % 2;
9084 };
9085 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9086 /*SourceOffset*/ 4, /*DestOffset*/ 0);
9087 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9088 /*SourceOffset*/ 0, /*DestOffset*/ 4);
9089
9090 // Now enact all the shuffles we've computed to move the inputs into their
9091 // target half.
9092 if (!isNoopShuffleMask(PSHUFLMask))
9093 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9094 getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9095 if (!isNoopShuffleMask(PSHUFHMask))
9096 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9097 getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9098 if (!isNoopShuffleMask(PSHUFDMask))
9099 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9100 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9101 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9102 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9103
9104 // At this point, each half should contain all its inputs, and we can then
9105 // just shuffle them into their final position.
9106 assert(std::count_if(LoMask.begin(), LoMask.end(),
9107 [](int M) { return M >= 4; }) == 0 &&
9108 "Failed to lift all the high half inputs to the low mask!");
9109 assert(std::count_if(HiMask.begin(), HiMask.end(),
9110 [](int M) { return M >= 0 && M < 4; }) == 0 &&
9111 "Failed to lift all the low half inputs to the high mask!");
9112
9113 // Do a half shuffle for the low mask.
9114 if (!isNoopShuffleMask(LoMask))
9115 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9116 getV4X86ShuffleImm8ForMask(LoMask, DAG));
9117
9118 // Do a half shuffle with the high mask after shifting its values down.
9119 for (int &M : HiMask)
9120 if (M >= 0)
9121 M -= 4;
9122 if (!isNoopShuffleMask(HiMask))
9123 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9124 getV4X86ShuffleImm8ForMask(HiMask, DAG));
9125
9126 return V;
9127}
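
As a rough worked example of the generic path above (hand-traced numbers, so treat them as illustrative): the single-input v8i16 mask {0, 1, 4, 5, 2, 3, 6, 7} has two cross-half inputs in each direction, so no 3:1 balancing is needed, and the bookkeeping plays out as follows:

// LToL = {0, 1}, HToL = {4, 5}, LToH = {2, 3}, HToH = {6, 7}
// fixInPlaceInputs leaves PSHUFLMask = {0, 1, -1, -1} and PSHUFHMask = {-1, -1, 2, 3}
// moveInputsToRightHalf fills PSHUFDMask = {0, 2, 1, 3}
// PSHUFLW/PSHUFHW end up as no-ops, so the whole shuffle becomes a single
// PSHUFD that swaps the two middle dwords.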
9128
9129/// \brief Detect whether the mask pattern should be lowered through
9130/// interleaving.
9131///
9132/// This essentially tests whether viewing the mask as an interleaving of two
9133/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9134/// lowering it through interleaving is a significantly better strategy.
9135static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9136 int NumEvenInputs[2] = {0, 0};
9137 int NumOddInputs[2] = {0, 0};
9138 int NumLoInputs[2] = {0, 0};
9139 int NumHiInputs[2] = {0, 0};
9140 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9141 if (Mask[i] < 0)
9142 continue;
9143
9144 int InputIdx = Mask[i] >= Size;
9145
9146 if (i < Size / 2)
9147 ++NumLoInputs[InputIdx];
9148 else
9149 ++NumHiInputs[InputIdx];
9150
9151 if ((i % 2) == 0)
9152 ++NumEvenInputs[InputIdx];
9153 else
9154 ++NumOddInputs[InputIdx];
9155 }
9156
9157 // The minimum number of cross-input results for both the interleaved and
9158 // split cases. If interleaving results in fewer cross-input results, return
9159 // true.
9160 int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9161 NumEvenInputs[0] + NumOddInputs[1]);
9162 int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9163 NumLoInputs[0] + NumHiInputs[1]);
9164 return InterleavedCrosses < SplitCrosses;
9165}
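
For a concrete instance of the heuristic (illustrative only): the v8i16 mask {0, 9, 1, 8, 2, 11, 3, 10} draws every even result lane from V1 and every odd lane from V2, so NumEvenInputs = {4, 0} and NumOddInputs = {0, 4} give InterleavedCrosses = 0, while NumLoInputs = {2, 2} and NumHiInputs = {2, 2} give SplitCrosses = 4. Since 0 < 4, the routine reports that interleaving is the cheaper strategy for this mask.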
9166
9167/// \brief Blend two v8i16 vectors using a naive unpack strategy.
9168///
9169/// This strategy only works when the inputs from each vector fit into a single
9170/// half of that vector, and generally there are not so many inputs as to leave
9171/// the in-place shuffles required highly constrained (and thus expensive). It
9172/// shifts all the inputs into a single side of both input vectors and then
9173/// uses an unpack to interleave these inputs in a single vector. At that
9174/// point, we will fall back on the generic single input shuffle lowering.
9175static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9176 SDValue V2,
9177 MutableArrayRef<int> Mask,
9178 const X86Subtarget *Subtarget,
9179 SelectionDAG &DAG) {
9180 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9181 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9182 SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9183 for (int i = 0; i < 8; ++i)
9184 if (Mask[i] >= 0 && Mask[i] < 4)
9185 LoV1Inputs.push_back(i);
9186 else if (Mask[i] >= 4 && Mask[i] < 8)
9187 HiV1Inputs.push_back(i);
9188 else if (Mask[i] >= 8 && Mask[i] < 12)
9189 LoV2Inputs.push_back(i);
9190 else if (Mask[i] >= 12)
9191 HiV2Inputs.push_back(i);
9192
9193 int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9194 int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9195 (void)NumV1Inputs;
9196 (void)NumV2Inputs;
9197 assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9198 assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9199 assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9200
9201 bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9202 HiV1Inputs.size() + HiV2Inputs.size();
9203
9204 auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9205 ArrayRef<int> HiInputs, bool MoveToLo,
9206 int MaskOffset) {
9207 ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9208 ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9209 if (BadInputs.empty())
9210 return V;
9211
9212 int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9213 int MoveOffset = MoveToLo ? 0 : 4;
9214
9215 if (GoodInputs.empty()) {
9216 for (int BadInput : BadInputs) {
9217 MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9218 Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9219 }
9220 } else {
9221 if (GoodInputs.size() == 2) {
9222 // If the low inputs are spread across two dwords, pack them into
9223 // a single dword.
9224 MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9225 MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9226 Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9227 Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9228 } else {
9229 // Otherwise pin the good inputs.
9230 for (int GoodInput : GoodInputs)
9231 MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9232 }
9233
9234 if (BadInputs.size() == 2) {
9235 // If we have two bad inputs then there may be either one or two good
9236 // inputs fixed in place. Find a fixed input, and then find the *other*
9237 // two adjacent indices by using modular arithmetic.
9238 int GoodMaskIdx =
9239 std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9240 [](int M) { return M >= 0; }) -
9241 std::begin(MoveMask);
9242 int MoveMaskIdx =
9243 ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9244 assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9245 assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9246 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9247 MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9248 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9249 Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9250 } else {
9251 assert(BadInputs.size() == 1 && "All sizes handled");
9252 int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9253 std::end(MoveMask), -1) -
9254 std::begin(MoveMask);
9255 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9256 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9257 }
9258 }
9259
9260 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9261 MoveMask);
9262 };
9263 V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9264 /*MaskOffset*/ 0);
9265 V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9266 /*MaskOffset*/ 8);
9267
9268 // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9269 // cross-half traffic in the final shuffle.
9270
9271 // Munge the mask to be a single-input mask after the unpack merges the
9272 // results.
9273 for (int &M : Mask)
9274 if (M != -1)
9275 M = 2 * (M % 4) + (M / 8);
9276
9277 return DAG.getVectorShuffle(
9278 MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9279 DL, MVT::v8i16, V1, V2),
9280 DAG.getUNDEF(MVT::v8i16), Mask);
9281}
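
The final mask munge above follows from how the unpack interleaves words once each input's relevant words sit in the merge half: UNPCKL yields {V1[0], V2[0], V1[1], V2[1], ...} (UNPCKH does the same with the high words), so a remaining mask entry M lands in lane 2 * (M % 4) + (M / 8). As purely illustrative numbers, M == 9 (word 1 of V2) maps to lane 3 and M == 2 (word 2 of V1) maps to lane 4.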
9282
9283/// \brief Generic lowering of 8-lane i16 shuffles.
9284///
9285/// This handles both single-input shuffles and combined shuffle/blends with
9286/// two inputs. The single input shuffles are immediately delegated to
9287/// a dedicated lowering routine.
9288///
9289/// The blends are lowered in one of three fundamental ways. If there are few
9290/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9291/// of the input is significantly cheaper when lowered as an interleaving of
9292/// the two inputs, try to interleave them. Otherwise, blend the low and high
9293/// halves of the inputs separately (making them have relatively few inputs)
9294/// and then concatenate them.
9295static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9296 const X86Subtarget *Subtarget,
9297 SelectionDAG &DAG) {
9298 SDLoc DL(Op);
9299 assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9300 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9301 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9302 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9303 ArrayRef<int> OrigMask = SVOp->getMask();
9304 int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9305 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9306 MutableArrayRef<int> Mask(MaskStorage);
9307
9308 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9309
9310 // Whenever we can lower this as a zext, that instruction is strictly faster
9311 // than any alternative.
9312 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9313 DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9314 return ZExt;
9315
9316 auto isV1 = [](int M) { return M >= 0 && M < 8; };
9317 auto isV2 = [](int M) { return M >= 8; };
9318
9319 int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9320 int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9321
9322 if (NumV2Inputs == 0)
9323 return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9324
9325 assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9326 "to be V1-input shuffles.");
9327
9328 // Try to use byte shift instructions.
9329 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9330 DL, MVT::v8i16, V1, V2, Mask, DAG))
9331 return Shift;
9332
9333 // There are special ways we can lower some single-element blends.
9334 if (NumV2Inputs == 1)
9335 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9336 Mask, Subtarget, DAG))
9337 return V;
9338
9339 // Use dedicated unpack instructions for masks that match their pattern.
9340 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9341 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9342 if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9343 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9344
9345 if (Subtarget->hasSSE41())
9346 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9347 Subtarget, DAG))
9348 return Blend;
9349
9350 // Try to use byte rotation instructions.
9351 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9352 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9353 return Rotate;
9354
9355 if (NumV1Inputs + NumV2Inputs <= 4)
9356 return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9357
9358 // Check whether an interleaving lowering is likely to be more efficient.
9359 // This isn't perfect but it is a strong heuristic that tends to work well on
9360 // the kinds of shuffles that show up in practice.
9361 //
9362 // FIXME: Handle 1x, 2x, and 4x interleaving.
9363 if (shouldLowerAsInterleaving(Mask)) {
9364 // FIXME: Figure out whether we should pack these into the low or high
9365 // halves.
9366
9367 int EMask[8], OMask[8];
9368 for (int i = 0; i < 4; ++i) {
9369 EMask[i] = Mask[2*i];
9370 OMask[i] = Mask[2*i + 1];
9371 EMask[i + 4] = -1;
9372 OMask[i + 4] = -1;
9373 }
9374
9375 SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9376 SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9377
9378 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9379 }
9380
9381 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9382 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9383
9384 for (int i = 0; i < 4; ++i) {
9385 LoBlendMask[i] = Mask[i];
9386 HiBlendMask[i] = Mask[i + 4];
9387 }
9388
9389 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9390 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9391 LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9392 HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9393
9394 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9395 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9396}
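
To illustrate the final fallback (hand-traced, so indicative only): the two-input mask {0, 1, 8, 9, 4, 5, 12, 13} has four inputs from each operand and balanced even/odd and low/high counts, so none of the earlier paths fire for it. It splits into LoBlendMask = {0, 1, 8, 9, -1, -1, -1, -1} and HiBlendMask = {4, 5, 12, 13, -1, -1, -1, -1}, each a shuffle with at most four inputs whose results land in the low 64 bits of its vector; bitcasting to v2i64 and UNPCKL-ing the two low quadwords then concatenates them into the requested vector.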
9397
9398/// \brief Check whether a compaction lowering can be done by dropping even
9399/// elements and compute how many times even elements must be dropped.
9400///
9401/// This handles shuffles which take every Nth element where N is a power of
9402/// two. Example shuffle masks:
9403///
9404/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
9405/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9406/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
9407/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
9408/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
9409/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
9410///
9411/// Any of these lanes can of course be undef.
9412///
9413/// This routine only supports N <= 3.
9414/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9415/// for larger N.
9416///
9417/// \returns N above, or the number of times even elements must be dropped if
9418/// there is such a number. Otherwise returns zero.
9419static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9420 // Figure out whether we're looping over two inputs or just one.
9421 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9422
9423 // The modulus for the shuffle vector entries is based on whether this is
9424 // a single input or not.
9425 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9426 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9427 "We should only be called with masks with a power-of-2 size!");
9428
9429 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9430
9431 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9432 // and 2^3 simultaneously. This is because we may have ambiguity with
9433 // partially undef inputs.
9434 bool ViableForN[3] = {true, true, true};
9435
9436 for (int i = 0, e = Mask.size(); i < e; ++i) {
9437 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9438 // want.
9439 if (Mask[i] == -1)
9440 continue;
9441
9442 bool IsAnyViable = false;
9443 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9444 if (ViableForN[j]) {
9445 uint64_t N = j + 1;
9446
9447 // The shuffle mask must be equal to (i * 2^N) % M.
9448 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9449 IsAnyViable = true;
9450 else
9451 ViableForN[j] = false;
9452 }
9453 // Early exit if we exhaust the possible powers of two.
9454 if (!IsAnyViable)
9455 break;
9456 }
9457
9458 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9459 if (ViableForN[j])
9460 return j + 1;
9461
9462 // Return 0 as there is no viable power of two.
9463 return 0;
9464}
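
A standalone sketch of the same stride test, restricted to the single-input case for brevity (a hypothetical helper, not part of this file; it mirrors the Mask[i] == (i << N) & (M - 1) check above):

#include <cstddef>
#include <cstdint>
#include <vector>

static int dropEvenElementsFactor(const std::vector<int> &Mask) {
  uint64_t ModMask = (uint64_t)Mask.size() - 1; // single-input modulus
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (std::size_t i = 0; i < Mask.size(); ++i)
      if (Mask[i] != -1 && (uint64_t)Mask[i] != (((uint64_t)i << N) & ModMask))
        Viable = false;
    if (Viable)
      return N; // the smallest viable power wins, as above
  }
  return 0; // no viable power of two
}
// e.g. {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14} yields N == 1.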
9465
9466/// \brief Generic lowering of v16i8 shuffles.
9467///
9468/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9469/// detect any complexity reducing interleaving. If that doesn't help, it uses
9470/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9471/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9472/// back together.
9473static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9474 const X86Subtarget *Subtarget,
9475 SelectionDAG &DAG) {
9476 SDLoc DL(Op);
9477 assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9478 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9479 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9480 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9481 ArrayRef<int> OrigMask = SVOp->getMask();
9482 assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9483
9484 // Try to use byte shift instructions.
9485 if (SDValue Shift = lowerVectorShuffleAsByteShift(
9486 DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9487 return Shift;
9488
9489 // Try to use byte rotation instructions.
9490 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9491 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9492 return Rotate;
9493
9494 // Try to use a zext lowering.
9495 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9496 DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9497 return ZExt;
9498
9499 int MaskStorage[16] = {
9500 OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9501 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
9502 OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
9503 OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9504 MutableArrayRef<int> Mask(MaskStorage);
9505 MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9506 MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9507
9508 int NumV2Elements =
9509 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9510
9511 // For single-input shuffles, there are some nicer lowering tricks we can use.
9512 if (NumV2Elements == 0) {
9513 // Check for being able to broadcast a single element.
9514 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9515 Mask, Subtarget, DAG))
9516 return Broadcast;
9517
9518 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9519 // Notably, this handles splat and partial-splat shuffles more efficiently.
9520 // However, it only makes sense if the pre-duplication shuffle simplifies
9521 // things significantly. Currently, this means we need to be able to
9522 // express the pre-duplication shuffle as an i16 shuffle.
9523 //
9524 // FIXME: We should check for other patterns which can be widened into an
9525 // i16 shuffle as well.
9526 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9527 for (int i = 0; i < 16; i += 2)
9528 if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9529 return false;
9530
9531 return true;
9532 };
9533 auto tryToWidenViaDuplication = [&]() -> SDValue {
9534 if (!canWidenViaDuplication(Mask))
9535 return SDValue();
9536 SmallVector<int, 4> LoInputs;
9537 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9538 [](int M) { return M >= 0 && M < 8; });
9539 std::sort(LoInputs.begin(), LoInputs.end());
9540 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9541 LoInputs.end());
9542 SmallVector<int, 4> HiInputs;
9543 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9544 [](int M) { return M >= 8; });
9545 std::sort(HiInputs.begin(), HiInputs.end());
9546 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9547 HiInputs.end());
9548
9549 bool TargetLo = LoInputs.size() >= HiInputs.size();
9550 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9551 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9552
9553 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9554 SmallDenseMap<int, int, 8> LaneMap;
9555 for (int I : InPlaceInputs) {
9556 PreDupI16Shuffle[I/2] = I/2;
9557 LaneMap[I] = I;
9558 }
9559 int j = TargetLo ? 0 : 4, je = j + 4;
9560 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9561 // Check if j is already a shuffle of this input. This happens when
9562 // there are two adjacent bytes after we move the low one.
9563 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9564 // If we haven't yet mapped the input, search for a slot into which
9565 // we can map it.
9566 while (j < je && PreDupI16Shuffle[j] != -1)
9567 ++j;
9568
9569 if (j == je)
9570 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9571 return SDValue();
9572
9573 // Map this input with the i16 shuffle.
9574 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9575 }
9576
9577 // Update the lane map based on the mapping we ended up with.
9578 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9579 }
9580 V1 = DAG.getNode(
9581 ISD::BITCAST, DL, MVT::v16i8,
9582 DAG.getVectorShuffle(MVT::v8i16, DL,
9583 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9584 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9585
9586 // Unpack the bytes to form the i16s that will be shuffled into place.
9587 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9588 MVT::v16i8, V1, V1);
9589
9590 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9591 for (int i = 0; i < 16; ++i)
9592 if (Mask[i] != -1) {
9593 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9594 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9595 if (PostDupI16Shuffle[i / 2] == -1)
9596 PostDupI16Shuffle[i / 2] = MappedMask;
9597 else
9598 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9599 "Conflicting entries in the original shuffle!");
9600 }
9601 return DAG.getNode(
9602 ISD::BITCAST, DL, MVT::v16i8,
9603 DAG.getVectorShuffle(MVT::v8i16, DL,
9604 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9605 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9606 };
9607 if (SDValue V = tryToWidenViaDuplication())
9608 return V;
9609 }
9610
9611 // Check whether an interleaving lowering is likely to be more efficient.
9612 // This isn't perfect but it is a strong heuristic that tends to work well on
9613 // the kinds of shuffles that show up in practice.
9614 //
9615 // FIXME: We need to handle other interleaving widths (i16, i32, ...).
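// Illustrative note (not part of the original source): for a byte interleave
// such as {0, 16, 1, 17, ..., 7, 23}, the even outputs {0..7} and the odd
// outputs {16..23} each reduce to a trivial shuffle of a single input, and one
// UNPCKL of those two results produces the interleaved vector.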
9616 if (shouldLowerAsInterleaving(Mask)) {
9617 int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9618 return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9619 });
9620 int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9621 return (M >= 8 && M < 16) || M >= 24;
9622 });
9623 int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9624 -1, -1, -1, -1, -1, -1, -1, -1};
9625 int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9626 -1, -1, -1, -1, -1, -1, -1, -1};
9627 bool UnpackLo = NumLoHalf >= NumHiHalf;
9628 MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9629 MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9630 for (int i = 0; i < 8; ++i) {
9631 TargetEMask[i] = Mask[2 * i];
9632 TargetOMask[i] = Mask[2 * i + 1];
9633 }
9634
9635 SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9636 SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9637
9638 return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9639 MVT::v16i8, Evens, Odds);
9640 }
9641
9642 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9643 // with PSHUFB. It is important to do this before we attempt to generate any
9644 // blends but after all of the single-input lowerings. If the single input
9645 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9646 // want to preserve that and we can DAG combine any longer sequences into
9647 // a PSHUFB in the end. But once we start blending from multiple inputs,
9648 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9649 // and there are *very* few patterns that would actually be faster than the
9650 // PSHUFB approach because of its ability to zero lanes.
9651 //
9652 // FIXME: The only exceptions to the above are blends which are exact
9653 // interleavings with direct instructions supporting them. We currently don't
9654 // handle those well here.
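// Illustrative note (not part of the original source): the code below builds
// one PSHUFB control vector per input. For an output byte with Mask[i] == 3,
// the V1 control byte is 3 and the V2 control byte is 0x80 (zero the lane);
// for Mask[i] == 18, the V1 control byte is 0x80 and the V2 control byte is 2.
// ORing the two shuffled results merges the contributions of V1 and V2.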
9655 if (Subtarget->hasSSSE3()) {
9656 SDValue V1Mask[16];
9657 SDValue V2Mask[16];
9658 bool V1InUse = false;
9659 bool V2InUse = false;
9660 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9661
9662 for (int i = 0; i < 16; ++i) {
9663 if (Mask[i] == -1) {
9664 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9665 } else {
9666 const int ZeroMask = 0x80;
9667 int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9668 int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9669 if (Zeroable[i])
9670 V1Idx = V2Idx = ZeroMask;
9671 V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9672 V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9673 V1InUse |= (ZeroMask != V1Idx);
9674 V2InUse |= (ZeroMask != V2Idx);
9675 }
9676 }
9677
9678 if (V1InUse)
9679 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9680 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9681 if (V2InUse)
9682 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9683 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9684
9685 // If we need shuffled inputs from both, blend the two.
9686 if (V1InUse && V2InUse)
9687 return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9688 if (V1InUse)
9689 return V1; // Single inputs are easy.
9690 if (V2InUse)
9691 return V2; // Single inputs are easy.
9692 // Shuffling to a zeroable vector.
9693 return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9694 }
9695
9696 // There are special ways we can lower some single-element blends.
9697 if (NumV2Elements == 1)
9698 if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9699 Mask, Subtarget, DAG))
9700 return V;
9701
9702 // Check whether a compaction lowering can be done. This handles shuffles
9703 // which take every Nth element for some even N. See the helper function for
9704 // details.
9705 //
9706 // We special case these as they can be particularly efficiently handled with
9707 // the PACKUSWB instruction on x86 and they show up in common patterns of
9708 // rearranging bytes to truncate wide elements.
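// Illustrative note (not part of the original source): a mask that keeps every
// second byte, e.g. {0, 2, 4, ..., 30}, has NumEvenDrops == 1. The AND below
// clears the high byte of each i16 element, and a single PACKUSWB then packs
// the surviving low bytes of V1 and V2 into one v16i8 result.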
9709 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9710 // NumEvenDrops is the power of two stride of the elements. Another way of
9711 // thinking about it is that we need to drop the even elements this many
9712 // times to get the original input.
9713 bool IsSingleInput = isSingleInputShuffleMask(Mask);
9714
9715 // First we need to zero all the dropped bytes.
9716 assert(NumEvenDrops <= 3 &&
9717 "No support for dropping even elements more than 3 times.");
9718 // We use the mask type to pick which bytes are preserved based on how many
9719 // elements are dropped.
9720 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9721 SDValue ByteClearMask =
9722 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9723 DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9724 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9725 if (!IsSingleInput)
9726 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9727
9728 // Now pack things back together.
9729 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9730 V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9731 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9732 for (int i = 1; i < NumEvenDrops; ++i) {
9733 Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9734 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9735 }
9736
9737 return Result;
9738 }
9739
9740 int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9741 int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9742 int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9743 int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9744
9745 auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9746 MutableArrayRef<int> V1HalfBlendMask,
9747 MutableArrayRef<int> V2HalfBlendMask) {
9748 for (int i = 0; i < 8; ++i)
9749 if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9750 V1HalfBlendMask[i] = HalfMask[i];
9751 HalfMask[i] = i;
9752 } else if (HalfMask[i] >= 16) {
9753 V2HalfBlendMask[i] = HalfMask[i] - 16;
9754 HalfMask[i] = i + 8;
9755 }
9756 };
9757 buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9758 buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9759
9760 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9761
9762 auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9763 MutableArrayRef<int> HiBlendMask) {
9764 SDValue V1, V2;
9765 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9766 // them out and avoid using UNPCK{L,H} to extract the elements of V as
9767 // i16s.
9768 if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9769 [](int M) { return M >= 0 && M % 2 == 1; }) &&
9770 std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9771 [](int M) { return M >= 0 && M % 2 == 1; })) {
9772 // Use a mask to drop the high bytes.
9773 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9774 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9775 DAG.getConstant(0x00FF, MVT::v8i16));
9776
9777 // This will be a single vector shuffle instead of a blend so nuke V2.
9778 V2 = DAG.getUNDEF(MVT::v8i16);
9779
9780 // Squash the masks to point directly into V1.
9781 for (int &M : LoBlendMask)
9782 if (M >= 0)
9783 M /= 2;
9784 for (int &M : HiBlendMask)
9785 if (M >= 0)
9786 M /= 2;
9787 } else {
9788 // Otherwise just unpack the low half of V into V1 and the high half into
9789 // V2 so that we can blend them as i16s.
9790 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9791 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9792 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9793 DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9794 }
9795
9796 SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9797 SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9798 return std::make_pair(BlendedLo, BlendedHi);
9799 };
9800 SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9801 std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9802 std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9803
9804 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9805 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9806
9807 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9808}
9809
9810/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9811///
9812/// This routine breaks down the specific type of 128-bit shuffle and
9813/// dispatches to the lowering routines accordingly.
9814static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9815 MVT VT, const X86Subtarget *Subtarget,
9816 SelectionDAG &DAG) {
9817 switch (VT.SimpleTy) {
9818 case MVT::v2i64:
9819 return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9820 case MVT::v2f64:
9821 return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9822 case MVT::v4i32:
9823 return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9824 case MVT::v4f32:
9825 return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9826 case MVT::v8i16:
9827 return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9828 case MVT::v16i8:
9829 return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9830
9831 default:
9832 llvm_unreachable("Unimplemented!");
9833 }
9834}
9835
9836/// \brief Helper function to test whether a shuffle mask could be
9837/// simplified by widening the elements being shuffled.
9838///
9839/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9840/// leaves it in an unspecified state.
9841///
9842/// NOTE: This must handle normal vector shuffle masks and *target* vector
9843/// shuffle masks. The latter have the special property of a '-2' representing
9844/// a zeroed lane of a vector.
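///
/// Illustrative example (not from the original source): the v4 mask
/// {0, 1, 6, 7} widens to the v2 mask {0, 3} because each pair is adjacent and
/// aligned, whereas {1, 2, -1, -1} cannot be widened since its first pair
/// straddles two wide elements.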
9845static bool canWidenShuffleElements(ArrayRef<int> Mask,
9846 SmallVectorImpl<int> &WidenedMask) {
9847 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9848 // If both elements are undef, it's trivial.
9849 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9850 WidenedMask.push_back(SM_SentinelUndef);
9851 continue;
9852 }
9853
9854 // Check for an undef mask and a mask value properly aligned to fit with
9855 // a pair of values. If we find such a case, use the non-undef mask's value.
9856 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9857 WidenedMask.push_back(Mask[i + 1] / 2);
9858 continue;
9859 }
9860 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9861 WidenedMask.push_back(Mask[i] / 2);
9862 continue;
9863 }
9864
9865 // When zeroing, we need to spread the zeroing across both lanes to widen.
9866 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9867 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9868 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9869 WidenedMask.push_back(SM_SentinelZero);
9870 continue;
9871 }
9872 return false;
9873 }
9874
9875 // Finally check if the two mask values are adjacent and aligned with
9876 // a pair.
9877 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9878 WidenedMask.push_back(Mask[i] / 2);
9879 continue;
9880 }
9881
9882 // Otherwise we can't safely widen the elements used in this shuffle.
9883 return false;
9884 }
9885 assert(WidenedMask.size() == Mask.size() / 2 &&
9886 "Incorrect size of mask after widening the elements!");
9887
9888 return true;
9889}
9890
9891/// \brief Generic routine to split a vector shuffle into half-sized shuffles.
9892///
9893/// This routine just extracts two subvectors, shuffles them independently, and
9894/// then concatenates them back together. This should work effectively with all
9895/// AVX vector shuffle types.
9896static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9897 SDValue V2, ArrayRef<int> Mask,
9898 SelectionDAG &DAG) {
9899 assert(VT.getSizeInBits() >= 256 &&
9900 "Only for 256-bit or wider vector shuffles!");
9901 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9902 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9903
9904 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9905 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9906
9907 int NumElements = VT.getVectorNumElements();
9908 int SplitNumElements = NumElements / 2;
9909 MVT ScalarVT = VT.getScalarType();
9910 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9911
9912 SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9913 DAG.getIntPtrConstant(0));
9914 SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9915 DAG.getIntPtrConstant(SplitNumElements));
9916 SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9917 DAG.getIntPtrConstant(0));
9918 SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9919 DAG.getIntPtrConstant(SplitNumElements));
9920
9921 // Now create two 4-way blends of these half-width vectors.
9922 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9923 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9924 SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9925 for (int i = 0; i < SplitNumElements; ++i) {
9926 int M = HalfMask[i];
9927 if (M >= NumElements) {
9928 if (M >= NumElements + SplitNumElements)
9929 UseHiV2 = true;
9930 else
9931 UseLoV2 = true;
9932 V2BlendMask.push_back(M - NumElements);
9933 V1BlendMask.push_back(-1);
9934 BlendMask.push_back(SplitNumElements + i);
9935 } else if (M >= 0) {
9936 if (M >= SplitNumElements)
9937 UseHiV1 = true;
9938 else
9939 UseLoV1 = true;
9940 V2BlendMask.push_back(-1);
9941 V1BlendMask.push_back(M);
9942 BlendMask.push_back(i);
9943 } else {
9944 V2BlendMask.push_back(-1);
9945 V1BlendMask.push_back(-1);
9946 BlendMask.push_back(-1);
9947 }
9948 }
9949
9950 // Because the lowering happens after all combining takes place, we need to
9951 // manually combine these blend masks as much as possible so that we create
9952 // a minimal number of high-level vector shuffle nodes.
9953
9954 // First try just blending the halves of V1 or V2.
9955 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9956 return DAG.getUNDEF(SplitVT);
9957 if (!UseLoV2 && !UseHiV2)
9958 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9959 if (!UseLoV1 && !UseHiV1)
9960 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9961
9962 SDValue V1Blend, V2Blend;
9963 if (UseLoV1 && UseHiV1) {
9964 V1Blend =
9965 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9966 } else {
9967 // We only use half of V1 so map the usage down into the final blend mask.
9968 V1Blend = UseLoV1 ? LoV1 : HiV1;
9969 for (int i = 0; i < SplitNumElements; ++i)
9970 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9971 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9972 }
9973 if (UseLoV2 && UseHiV2) {
9974 V2Blend =
9975 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9976 } else {
9977 // We only use half of V2 so map the usage down into the final blend mask.
9978 V2Blend = UseLoV2 ? LoV2 : HiV2;
9979 for (int i = 0; i < SplitNumElements; ++i)
9980 if (BlendMask[i] >= SplitNumElements)
9981 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9982 }
9983 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9984 };
9985 SDValue Lo = HalfBlend(LoMask);
9986 SDValue Hi = HalfBlend(HiMask);
9987 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9988}
9989
9990/// \brief Either split a vector in halves or decompose the shuffles and the
9991/// blend.
9992///
9993/// This is provided as a good fallback for many lowerings of non-single-input
9994/// shuffles with more than one 128-bit lane. In those cases, we want to select
9995/// between splitting the shuffle into 128-bit components and stitching those
9996/// back together vs. extracting the single-input shuffles and blending those
9997/// results.
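///
/// Illustrative example (not from the original source): a v8f32 mask such as
/// {0, 1, 2, 3, 8, 9, 10, 11} draws on only one 128-bit lane of each input, so
/// it is split, whereas a mask that broadcasts one element from each input,
/// e.g. {0, 8, 0, 8, 0, 8, 0, 8}, is decomposed into two broadcasts and a
/// blend.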
9998static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
9999 SDValue V2, ArrayRef<int> Mask,
10000 SelectionDAG &DAG) {
10001 assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10002 "lower single-input shuffles as it "
10003 "could then recurse on itself.");
10004 int Size = Mask.size();
10005
10006 // If this can be modeled as a broadcast of two elements followed by a blend,
10007 // prefer that lowering. This is especially important because broadcasts can
10008 // often fold with memory operands.
10009 auto DoBothBroadcast = [&] {
10010 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10011 for (int M : Mask)
10012 if (M >= Size) {
10013 if (V2BroadcastIdx == -1)
10014 V2BroadcastIdx = M - Size;
10015 else if (M - Size != V2BroadcastIdx)
10016 return false;
10017 } else if (M >= 0) {
10018 if (V1BroadcastIdx == -1)
10019 V1BroadcastIdx = M;
10020 else if (M != V1BroadcastIdx)
10021 return false;
10022 }
10023 return true;
10024 };
10025 if (DoBothBroadcast())
10026 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10027 DAG);
10028
10029 // If the inputs all stem from a single 128-bit lane of each input, then we
10030 // split them rather than blending because the split will decompose to
10031 // unusually few instructions.
10032 int LaneCount = VT.getSizeInBits() / 128;
10033 int LaneSize = Size / LaneCount;
10034 SmallBitVector LaneInputs[2];
10035 LaneInputs[0].resize(LaneCount, false);
10036 LaneInputs[1].resize(LaneCount, false);
10037 for (int i = 0; i < Size; ++i)
10038 if (Mask[i] >= 0)
10039 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10040 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10041 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10042
10043 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10044 // that the decomposed single-input shuffles don't end up here.
10045 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10046}
10047
10048/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10049/// a permutation and blend of those lanes.
10050///
10051/// This essentially blends the out-of-lane inputs to each lane into the lane
10052/// from a permuted copy of the vector. This lowering strategy results in four
10053/// instructions in the worst case for a single-input cross lane shuffle which
10054/// is lower than any other fully general cross-lane shuffle strategy I'm aware
10055/// of. Special cases for each particular shuffle pattern should be handled
10056/// prior to trying this lowering.
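///
/// Illustrative example (not from the original source): for the single-input
/// v4f64 mask {2, 3, 0, 1}, both lanes need elements from the other lane. The
/// code below builds a lane-flipped copy with VPERM2X128 and then applies the
/// in-lane blend mask {4, 5, 6, 7}, taking every element from the flipped
/// copy.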
10057static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10058 SDValue V1, SDValue V2,
10059 ArrayRef<int> Mask,
10060 SelectionDAG &DAG) {
10061 // FIXME: This should probably be generalized for 512-bit vectors as well.
10062 assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10063 int LaneSize = Mask.size() / 2;
10064
10065 // If there are only inputs from one 128-bit lane, splitting will in fact be
10066 // less expensive. The flags track whether the given lane contains an element
10067 // that crosses to another lane.
10068 bool LaneCrossing[2] = {false, false};
10069 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10070 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10071 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10072 if (!LaneCrossing[0] || !LaneCrossing[1])
10073 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10074
10075 if (isSingleInputShuffleMask(Mask)) {
10076 SmallVector<int, 32> FlippedBlendMask;
10077 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10078 FlippedBlendMask.push_back(
10079 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10080 ? Mask[i]
10081 : Mask[i] % LaneSize +
10082 (i / LaneSize) * LaneSize + Size));
10083
10084 // Flip the vector, and blend the results which should now be in-lane. The
10085 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10086 // 5 for the high source. The value 3 selects the high half of source 2 and
10087 // the value 2 selects the low half of source 2. We only use source 2 to
10088 // allow folding it into a memory operand.
10089 unsigned PERMMask = 3 | 2 << 4;
10090 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10091 V1, DAG.getConstant(PERMMask, MVT::i8));
10092 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10093 }
10094
10095 // This now reduces to two single-input shuffles of V1 and V2 which at worst
10096 // will be handled by the above logic and a blend of the results, much like
10097 // other patterns in AVX.
10098 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10099}
10100
10101/// \brief Handle lowering 2-lane 128-bit shuffles.
10102static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10103 SDValue V2, ArrayRef<int> Mask,
10104 const X86Subtarget *Subtarget,
10105 SelectionDAG &DAG) {
10106 // Blends are faster and handle all the non-lane-crossing cases.
10107 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10108 Subtarget, DAG))
10109 return Blend;
10110
10111 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10112 VT.getVectorNumElements() / 2);
10113 // Check for patterns which can be matched with a single insert of a 128-bit
10114 // subvector.
10115 if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10116 isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10117 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10118 DAG.getIntPtrConstant(0));
10119 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10120 Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10121 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10122 }
10123 if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10124 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10125 DAG.getIntPtrConstant(0));
10126 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10127 DAG.getIntPtrConstant(2));
10128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10129 }
10130
10131 // Otherwise form a 128-bit permutation.
10132 // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
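// Illustrative note (not part of the original source): the 128-bit halves of
// the concatenated (V1, V2) pair are numbered 0-3, so a widened mask such as
// {2, 3, 4, 5} gives PermMask = 1 | 2 << 4 = 0x21, selecting the high half of
// V1 for the low lane and the low half of V2 for the high lane.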
10133 unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10134 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10135 DAG.getConstant(PermMask, MVT::i8));
10136}
10137
10138/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10139/// shuffling each lane.
10140///
10141/// This will only succeed when the result of fixing the 128-bit lanes results
10142/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10143/// each 128-bit lane. This handles many cases where we can quickly blend away
10144/// the lane crosses early and then use simpler shuffles within each lane.
10145///
10146/// FIXME: It might be worthwhile at some point to support this without
10147/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10148/// in x86 only floating point has interesting non-repeating shuffles, and even
10149/// those are still *marginally* more expensive.
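///
/// Illustrative example (not from the original source): the v8f32 mask
/// {6, 7, 4, 5, 14, 15, 12, 13} first becomes the 64-bit lane shuffle
/// {2, 3, 6, 7} (selecting the high lanes of V1 and V2) and then the
/// repeating in-lane shuffle {2, 3, 0, 1, 6, 7, 4, 5}.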
10150static SDValue lowerVectorShuffleByMerging128BitLanes(
10151 SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10152 const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10153 assert(!isSingleInputShuffleMask(Mask) &&
10154 "This is only useful with multiple inputs.");
10155
10156 int Size = Mask.size();
10157 int LaneSize = 128 / VT.getScalarSizeInBits();
10158 int NumLanes = Size / LaneSize;
10159 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10160
10161 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10162 // check whether the in-128-bit lane shuffles share a repeating pattern.
10163 SmallVector<int, 4> Lanes;
10164 Lanes.resize(NumLanes, -1);
10165 SmallVector<int, 4> InLaneMask;
10166 InLaneMask.resize(LaneSize, -1);
10167 for (int i = 0; i < Size; ++i) {
10168 if (Mask[i] < 0)
10169 continue;
10170
10171 int j = i / LaneSize;
10172
10173 if (Lanes[j] < 0) {
10174 // First entry we've seen for this lane.
10175 Lanes[j] = Mask[i] / LaneSize;
10176 } else if (Lanes[j] != Mask[i] / LaneSize) {
10177 // This doesn't match the lane selected previously!
10178 return SDValue();
10179 }
10180
10181 // Check that within each lane we have a consistent shuffle mask.
10182 int k = i % LaneSize;
10183 if (InLaneMask[k] < 0) {
10184 InLaneMask[k] = Mask[i] % LaneSize;
10185 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10186 // This doesn't fit a repeating in-lane mask.
10187 return SDValue();
10188 }
10189 }
10190
10191 // First shuffle the lanes into place.
10192 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10193 VT.getSizeInBits() / 64);
10194 SmallVector<int, 8> LaneMask;
10195 LaneMask.resize(NumLanes * 2, -1);
10196 for (int i = 0; i < NumLanes; ++i)
10197 if (Lanes[i] >= 0) {
10198 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10199 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10200 }
10201
10202 V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10203 V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10204 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10205
10206 // Cast it back to the type we actually want.
10207 LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10208
10209 // Now do a simple shuffle that isn't lane crossing.
10210 SmallVector<int, 8> NewMask;
10211 NewMask.resize(Size, -1);
10212 for (int i = 0; i < Size; ++i)
10213 if (Mask[i] >= 0)
10214 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10215 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10216 "Must not introduce lane crosses at this point!");
10217
10218 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10219}
10220
10221/// \brief Test whether the specified input (0 or 1) is in-place blended by the
10222/// given mask.
10223///
10224/// This returns true if the elements from a particular input are already in the
10225/// slot required by the given mask and require no permutation.
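///
/// Illustrative example (not from the original source): for a two-input v4
/// shuffle, the mask {0, 4, 2, 6} leaves input 0 in place (elements 0 and 2
/// already sit at positions 0 and 2) but not input 1, whose element 0 (mask
/// value 4) would have to move to position 1.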
10226static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10227 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10228 int Size = Mask.size();
10229 for (int i = 0; i < Size; ++i)
10230 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10231 return false;
10232
10233 return true;
10234}
10235
10236/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10237///
10238/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10239/// isn't available.
10240static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10241 const X86Subtarget *Subtarget,
10242 SelectionDAG &DAG) {
10243 SDLoc DL(Op);
10244 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10245 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10246 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10247 ArrayRef<int> Mask = SVOp->getMask();
10248 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10249
10250 SmallVector<int, 4> WidenedMask;
10251 if (canWidenShuffleElements(Mask, WidenedMask))
10252 return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10253 DAG);
10254
10255 if (isSingleInputShuffleMask(Mask)) {
10256 // Check for being able to broadcast a single element.
10257 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10258 Mask, Subtarget, DAG))
10259 return Broadcast;
10260
10261 // Use low duplicate instructions for masks that match their pattern.
10262 if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10263 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10264
10265 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10266 // Non-half-crossing single input shuffles can be lowered with an
10267 // interleaved permutation.
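// Illustrative note (not part of the original source): each immediate bit
// selects the high or low element within its 128-bit lane, so the in-lane
// swap mask {1, 0, 3, 2} yields VPERMILPMask == 0b0101 == 5.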
10268 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10269 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10270 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10271 DAG.getConstant(VPERMILPMask, MVT::i8));
10272 }
10273
10274 // With AVX2 we have direct support for this permutation.
10275 if (Subtarget->hasAVX2())
10276 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10277 getV4X86ShuffleImm8ForMask(Mask, DAG));
10278
10279 // Otherwise, fall back.
10280 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10281 DAG);
10282 }
10283
10284 // X86 has dedicated unpack instructions that can handle specific blend
10285 // operations: UNPCKH and UNPCKL.
10286 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10287 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10288 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10289 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10290
10291 // If we have a single input to the zero element, insert that into V1 if we
10292 // can do so cheaply.
10293 int NumV2Elements =
10294 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10295 if (NumV2Elements == 1 && Mask[0] >= 4)
10296 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10297 MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10298 return Insertion;
10299
10300 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10301 Subtarget, DAG))
10302 return Blend;
10303
10304 // Check if the blend happens to exactly fit that of SHUFPD.
10305 if ((Mask[0] == -1 || Mask[0] < 2) &&
10306 (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10307 (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10308 (Mask[3] == -1 || Mask[3] >= 6)) {
10309 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10310 ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10311 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10312 DAG.getConstant(SHUFPDMask, MVT::i8));
10313 }
10314 if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10315 (Mask[1] == -1 || Mask[1] < 2) &&
10316 (Mask[2] == -1 || Mask[2] >= 6) &&
10317 (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10318 unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10319 ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10320 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10321 DAG.getConstant(SHUFPDMask, MVT::i8));
10322 }
10323
10324 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10325 // shuffle. However, if we have AVX2 and either input is already in place,
10326 // we will be able to shuffle the other input even across lanes in a single
10327 // instruction, so skip this pattern.
10328 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10329 isShuffleMaskInputInPlace(1, Mask))))
10330 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10331 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10332 return Result;
10333
10334 // If we have AVX2 then we always want to lower with a blend because at v4 we
10335 // can fully permute the elements.
10336 if (Subtarget->hasAVX2())
10337 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10338 Mask, DAG);
10339
10340 // Otherwise fall back on generic lowering.
10341 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10342}
10343
10344/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10345///
10346/// This routine is only called when we have AVX2 and thus a reasonable
10347/// instruction set for v4i64 shuffling.
10348static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10349 const X86Subtarget *Subtarget,
10350 SelectionDAG &DAG) {
10351 SDLoc DL(Op);
10352 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10353 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10354 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10355 ArrayRef<int> Mask = SVOp->getMask();
10356 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10357 assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10358
10359 SmallVector<int, 4> WidenedMask;
10360 if (canWidenShuffleElements(Mask, WidenedMask))
10361 return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10362 DAG);
10363
10364 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10365 Subtarget, DAG))
10366 return Blend;
10367
10368 // Check for being able to broadcast a single element.
10369 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10370 Mask, Subtarget, DAG))
10371 return Broadcast;
10372
10373 // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10374 // use lower latency instructions that will operate on both 128-bit lanes.
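// Illustrative note (not part of the original source): a per-lane swap such as
// the v4i64 mask {1, 0, 3, 2} repeats as {1, 0} in both lanes, so the code
// below expresses it as the v8i32 PSHUFD mask {2, 3, 0, 1} applied to each
// 128-bit lane.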
10375 SmallVector<int, 2> RepeatedMask;
10376 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10377 if (isSingleInputShuffleMask(Mask)) {
10378 int PSHUFDMask[] = {-1, -1, -1, -1};
10379 for (int i = 0; i < 2; ++i)
10380 if (RepeatedMask[i] >= 0) {
10381 PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10382 PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10383 }
10384 return DAG.getNode(
10385 ISD::BITCAST, DL, MVT::v4i64,
10386 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10387 DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10388 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10389 }
10390
10391 // Use dedicated unpack instructions for masks that match their pattern.
10392 if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10393 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10394 if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10395 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10396 }
10397
10398 // AVX2 provides a direct instruction for permuting a single input across
10399 // lanes.
10400 if (isSingleInputShuffleMask(Mask))
10401 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10402 getV4X86ShuffleImm8ForMask(Mask, DAG));
10403
10404 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10405 // shuffle. However, if we have AVX2 and either input is already in place,
10406 // we will be able to shuffle the other input even across lanes in a single
10407 // instruction, so skip this pattern.
10408 if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10409 isShuffleMaskInputInPlace(1, Mask))))
10410 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10411 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10412 return Result;
10413
10414 // Otherwise fall back on generic blend lowering.
10415 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10416 Mask, DAG);
10417}
10418
10419/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10420///
10421/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10422/// isn't available.
10423static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10424 const X86Subtarget *Subtarget,
10425 SelectionDAG &DAG) {
10426 SDLoc DL(Op);
10427 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10428 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10429 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10430 ArrayRef<int> Mask = SVOp->getMask();
10431 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10432
10433 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10434 Subtarget, DAG))
10435 return Blend;
10436
10437 // Check for being able to broadcast a single element.
10438 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10439 Mask, Subtarget, DAG))
10440 return Broadcast;
10441
10442 // If the shuffle mask is repeated in each 128-bit lane, we have many more
10443 // options to efficiently lower the shuffle.
10444 SmallVector<int, 4> RepeatedMask;
10445 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10446 assert(RepeatedMask.size() == 4 &&
10447 "Repeated masks must be half the mask width!");
10448
10449 // Use even/odd duplicate instructions for masks that match their pattern.
10450 if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10451 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10452 if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10453 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10454
10455 if (isSingleInputShuffleMask(Mask))
10456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10457 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10458
10459 // Use dedicated unpack instructions for masks that match their pattern.
10460 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10461 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10462 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10463 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10464
10465 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10466 // have already handled any direct blends. We also need to squash the
10467 // repeated mask into a simulated v4f32 mask.
10468 for (int i = 0; i < 4; ++i)
10469 if (RepeatedMask[i] >= 8)
10470 RepeatedMask[i] -= 4;
10471 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10472 }
10473
10474 // If we have a single input shuffle with different shuffle patterns in the
10475 // two 128-bit lanes, use the variable mask form of VPERMILPS.
10476 if (isSingleInputShuffleMask(Mask)) {
10477 SDValue VPermMask[8];
10478 for (int i = 0; i < 8; ++i)
10479 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10480 : DAG.getConstant(Mask[i], MVT::i32);
10481 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10482 return DAG.getNode(
10483 X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10484 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10485
10486 if (Subtarget->hasAVX2())
10487 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10488 DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10489 DAG.getNode(ISD::BUILD_VECTOR, DL,
10490 MVT::v8i32, VPermMask)),
10491 V1);
10492
10493 // Otherwise, fall back.
10494 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10495 DAG);
10496 }
10497
10498 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10499 // shuffle.
10500 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10501 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10502 return Result;
10503
10504 // If we have AVX2 then we always want to lower with a blend because at v8 we
10505 // can fully permute the elements.
10506 if (Subtarget->hasAVX2())
10507 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10508 Mask, DAG);
10509
10510 // Otherwise fall back on generic lowering.
10511 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10512}
10513
10514/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10515///
10516/// This routine is only called when we have AVX2 and thus a reasonable
10517/// instruction set for v8i32 shuffling.
10518static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10519 const X86Subtarget *Subtarget,
10520 SelectionDAG &DAG) {
10521 SDLoc DL(Op);
10522 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10523 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10524 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10525 ArrayRef<int> Mask = SVOp->getMask();
10526 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10527 assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10528
10529 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10530 Subtarget, DAG))
10531 return Blend;
10532
10533 // Check for being able to broadcast a single element.
10534 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10535 Mask, Subtarget, DAG))
10536 return Broadcast;
10537
10538 // If the shuffle mask is repeated in each 128-bit lane we can use more
10539 // efficient instructions that mirror the shuffles across the two 128-bit
10540 // lanes.
10541 SmallVector<int, 4> RepeatedMask;
10542 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10543 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10544 if (isSingleInputShuffleMask(Mask))
10545 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10546 getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10547
10548 // Use dedicated unpack instructions for masks that match their pattern.
10549 if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10550 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10551 if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10552 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10553 }
10554
10555 // If the shuffle patterns aren't repeated but it is a single input, directly
10556 // generate a cross-lane VPERMD instruction.
10557 if (isSingleInputShuffleMask(Mask)) {
10558 SDValue VPermMask[8];
10559 for (int i = 0; i < 8; ++i)
10560 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10561 : DAG.getConstant(Mask[i], MVT::i32);
10562 return DAG.getNode(
10563 X86ISD::VPERMV, DL, MVT::v8i32,
10564 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10565 }
10566
10567 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10568 // shuffle.
10569 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10570 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10571 return Result;
10572
10573 // Otherwise fall back on generic blend lowering.
10574 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10575 Mask, DAG);
10576}
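The PSHUFD path above packs the 4-element repeated mask into an 8-bit immediate, two bits per result element. A minimal standalone sketch of that packing (hypothetical helper name, not the LLVM routine itself; undef entries are assumed free to pick slot 0):

#include <array>
#include <cstdio>

// Hypothetical model of packing a repeated 4-element mask into a PSHUFD/VPSHUFD
// style immediate: result element i takes source element (Imm >> (2*i)) & 3.
static unsigned packShuffleImm8(const std::array<int, 4> &RepeatedMask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = RepeatedMask[i] < 0 ? 0 : RepeatedMask[i]; // undef: any slot will do
    Imm |= static_cast<unsigned>(M & 3) << (2 * i);
  }
  return Imm;
}

int main() {
  // The repeated mask <3,2,1,0> (reverse each 128-bit lane) packs to 0x1B.
  std::printf("0x%02X\n", packShuffleImm8({3, 2, 1, 0}));
  return 0;
}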
10577
10578/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10579///
10580/// This routine is only called when we have AVX2 and thus a reasonable
10581/// instruction set for v16i16 shuffling.
10582static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10583 const X86Subtarget *Subtarget,
10584 SelectionDAG &DAG) {
10585 SDLoc DL(Op);
10586 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10587 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10588 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10589 ArrayRef<int> Mask = SVOp->getMask();
10590 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10591 assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10592
10593 // Check for being able to broadcast a single element.
10594 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10595 Mask, Subtarget, DAG))
10596 return Broadcast;
10597
10598 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10599 Subtarget, DAG))
10600 return Blend;
10601
10602 // Use dedicated unpack instructions for masks that match their pattern.
10603 if (isShuffleEquivalent(Mask,
10604 // First 128-bit lane:
10605 0, 16, 1, 17, 2, 18, 3, 19,
10606 // Second 128-bit lane:
10607 8, 24, 9, 25, 10, 26, 11, 27))
10608 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10609 if (isShuffleEquivalent(Mask,
10610 // First 128-bit lane:
10611 4, 20, 5, 21, 6, 22, 7, 23,
10612 // Second 128-bit lane:
10613 12, 28, 13, 29, 14, 30, 15, 31))
10614 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10615
10616 if (isSingleInputShuffleMask(Mask)) {
10617 // There are no generalized cross-lane shuffle operations available on i16
10618 // element types.
10619 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10620 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10621 Mask, DAG);
10622
10623 SDValue PSHUFBMask[32];
10624 for (int i = 0; i < 16; ++i) {
10625 if (Mask[i] == -1) {
10626 PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10627 continue;
10628 }
10629
10630 int M = i < 8 ? Mask[i] : Mask[i] - 8;
10631 assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10632 PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10633 PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10634 }
10635 return DAG.getNode(
10636 ISD::BITCAST, DL, MVT::v16i16,
10637 DAG.getNode(
10638 X86ISD::PSHUFB, DL, MVT::v32i8,
10639 DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10640 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10641 }
10642
10643 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10644 // shuffle.
10645 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10646 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10647 return Result;
10648
10649 // Otherwise fall back on generic lowering.
10650 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10651}
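The single-input v16i16 path above widens each word index M into the byte pair (2*M, 2*M+1) so the shuffle can be performed by PSHUFB on v32i8. A standalone model of that expansion, assuming plain C++ containers and a hypothetical helper name:

#include <cstdio>
#include <vector>

// Hypothetical model: expand a per-word shuffle mask into a per-byte PSHUFB mask.
// Each in-lane word index M becomes the byte pair (2*M, 2*M + 1); undef (-1)
// words stay undef at byte granularity.
static std::vector<int> expandWordMaskToByteMask(const std::vector<int> &WordMask) {
  std::vector<int> ByteMask;
  for (int M : WordMask) {
    if (M < 0) {
      ByteMask.push_back(-1);
      ByteMask.push_back(-1);
      continue;
    }
    int InLane = M % 8;                 // PSHUFB only indexes within a 128-bit lane
    ByteMask.push_back(2 * InLane);
    ByteMask.push_back(2 * InLane + 1);
  }
  return ByteMask;
}

int main() {
  // Reverse the words of one 128-bit lane: <7,6,5,4,3,2,1,0>.
  for (int B : expandWordMaskToByteMask({7, 6, 5, 4, 3, 2, 1, 0}))
    std::printf("%d ", B);              // 14 15 12 13 10 11 8 9 6 7 4 5 2 3 0 1
  std::printf("\n");
  return 0;
}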
10652
10653/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10654///
10655/// This routine is only called when we have AVX2 and thus a reasonable
10656/// instruction set for v32i8 shuffling.
10657static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10658 const X86Subtarget *Subtarget,
10659 SelectionDAG &DAG) {
10660 SDLoc DL(Op);
10661 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10662 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10663 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10664 ArrayRef<int> Mask = SVOp->getMask();
10665 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10666 assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10667
10668 // Check for being able to broadcast a single element.
10669 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10670 Mask, Subtarget, DAG))
10671 return Broadcast;
10672
10673 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10674 Subtarget, DAG))
10675 return Blend;
10676
10677 // Use dedicated unpack instructions for masks that match their pattern.
10678 // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10679 // 256-bit lanes.
10680 if (isShuffleEquivalent(
10681 Mask,
10682 // First 128-bit lane:
10683 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10684 // Second 128-bit lane:
10685 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10686 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10687 if (isShuffleEquivalent(
10688 Mask,
10689 // First 128-bit lane:
10690 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10691 // Second 128-bit lane:
10692 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10693 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10694
10695 if (isSingleInputShuffleMask(Mask)) {
10696 // There are no generalized cross-lane shuffle operations available on i8
10697 // element types.
10698 if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10699 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10700 Mask, DAG);
10701
10702 SDValue PSHUFBMask[32];
10703 for (int i = 0; i < 32; ++i)
10704 PSHUFBMask[i] =
10705 Mask[i] < 0
10706 ? DAG.getUNDEF(MVT::i8)
10707 : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10708
10709 return DAG.getNode(
10710 X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10711 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10712 }
10713
10714 // Try to simplify this by merging 128-bit lanes to enable a lane-based
10715 // shuffle.
10716 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10717 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10718 return Result;
10719
10720 // Otherwise fall back on generic lowering.
10721 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10722}
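The unpack matching above compares the mask against a fixed candidate pattern, treating undef entries as wildcards. A small standalone sketch of that kind of equivalence test (hypothetical name; not the isShuffleEquivalent helper itself):

#include <cstdio>
#include <vector>

// Hypothetical model: a mask "matches" a candidate pattern if every defined
// entry equals the candidate at that position; undef (-1) entries match anything.
static bool masksEquivalent(const std::vector<int> &Mask,
                            const std::vector<int> &Candidate) {
  if (Mask.size() != Candidate.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Candidate[i])
      return false;
  return true;
}

int main() {
  // A v4 low-unpack pattern interleaves the low halves of both inputs.
  std::vector<int> Unpckl = {0, 4, 1, 5};
  std::printf("%d\n", masksEquivalent({0, -1, 1, 5}, Unpckl)); // 1: undef is a wildcard
  std::printf("%d\n", masksEquivalent({0, 4, 2, 5}, Unpckl));  // 0: element 2 mismatches
  return 0;
}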
10723
10724/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10725///
10726/// This routine either breaks down the specific type of a 256-bit x86 vector
10727/// shuffle or splits it into two 128-bit shuffles and fuses the results back
10728/// together based on the available instructions.
10729static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10730 MVT VT, const X86Subtarget *Subtarget,
10731 SelectionDAG &DAG) {
10732 SDLoc DL(Op);
10733 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10734 ArrayRef<int> Mask = SVOp->getMask();
10735
10736 // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10737 // check for those subtargets here and avoid much of the subtarget querying in
10738 // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10739 // ability to manipulate a 256-bit vector with integer types. Since we'll use
10740 // floating point types there eventually, just immediately cast everything to
10741 // a float and operate entirely in that domain.
10742 if (VT.isInteger() && !Subtarget->hasAVX2()) {
10743 int ElementBits = VT.getScalarSizeInBits();
10744 if (ElementBits < 32)
10745 // No floating point type available, decompose into 128-bit vectors.
10746 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10747
10748 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10749 VT.getVectorNumElements());
10750 V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10751 V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10752 return DAG.getNode(ISD::BITCAST, DL, VT,
10753 DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10754 }
10755
10756 switch (VT.SimpleTy) {
10757 case MVT::v4f64:
10758 return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10759 case MVT::v4i64:
10760 return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10761 case MVT::v8f32:
10762 return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10763 case MVT::v8i32:
10764 return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10765 case MVT::v16i16:
10766 return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10767 case MVT::v32i8:
10768 return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10769
10770 default:
10771 llvm_unreachable("Not a valid 256-bit x86 vector type!");
10772 }
10773}
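The AVX1 branch above bitcasts 256-bit integer vectors into the floating-point domain, shuffles there, and bitcasts back. A standalone illustration of the same trick with compiler intrinsics, assuming an AVX-capable host and -mavx; this is hand-written demo code, not output of this lowering:

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) int In[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  __m256i VI = _mm256_load_si256(reinterpret_cast<const __m256i *>(In));
  __m256 VF = _mm256_castsi256_ps(VI);        // free bitcast, no data change
  // Replicate element 0 of each 128-bit lane: mask <0,0,0,0, 4,4,4,4>.
  __m256 SF = _mm256_shuffle_ps(VF, VF, 0x00);
  __m256i SI = _mm256_castps_si256(SF);       // bitcast back to the integer domain
  alignas(32) int Out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(Out), SI);
  for (int V : Out)
    std::printf("%d ", V);                    // expected: 0 0 0 0 4 4 4 4
  std::printf("\n");
  return 0;
}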
10774
10775/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10776static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10777 const X86Subtarget *Subtarget,
10778 SelectionDAG &DAG) {
10779 SDLoc DL(Op);
10780 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10781 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10782 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10783 ArrayRef<int> Mask = SVOp->getMask();
10784 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10785
10786 // X86 has dedicated unpack instructions that can handle specific blend
10787 // operations: UNPCKH and UNPCKL.
10788 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10789 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10790 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10791 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10792
10793 // FIXME: Implement direct support for this type!
10794 return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10795}
10796
10797/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10798static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10799 const X86Subtarget *Subtarget,
10800 SelectionDAG &DAG) {
10801 SDLoc DL(Op);
10802 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10803 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10804 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10805 ArrayRef<int> Mask = SVOp->getMask();
10806 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10807
10808 // Use dedicated unpack instructions for masks that match their pattern.
10809 if (isShuffleEquivalent(Mask,
10810 0, 16, 1, 17, 4, 20, 5, 21,
10811 8, 24, 9, 25, 12, 28, 13, 29))
10812 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10813 if (isShuffleEquivalent(Mask,
10814 2, 18, 3, 19, 6, 22, 7, 23,
10815 10, 26, 11, 27, 14, 30, 15, 31))
10816 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10817
10818 // FIXME: Implement direct support for this type!
10819 return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10820}
10821
10822/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10823static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10824 const X86Subtarget *Subtarget,
10825 SelectionDAG &DAG) {
10826 SDLoc DL(Op);
10827 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10828 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10829 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10830 ArrayRef<int> Mask = SVOp->getMask();
10831 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10832
10833 // X86 has dedicated unpack instructions that can handle specific blend
10834 // operations: UNPCKH and UNPCKL.
10835 if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10836 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10837 if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10838 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10839
10840 // FIXME: Implement direct support for this type!
10841 return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10842}
10843
10844/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10845static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10846 const X86Subtarget *Subtarget,
10847 SelectionDAG &DAG) {
10848 SDLoc DL(Op);
10849 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10850 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10851 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10852 ArrayRef<int> Mask = SVOp->getMask();
10853 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10854
10855 // Use dedicated unpack instructions for masks that match their pattern.
10856 if (isShuffleEquivalent(Mask,
10857 0, 16, 1, 17, 4, 20, 5, 21,
10858 8, 24, 9, 25, 12, 28, 13, 29))
10859 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10860 if (isShuffleEquivalent(Mask,
10861 2, 18, 3, 19, 6, 22, 7, 23,
10862 10, 26, 11, 27, 14, 30, 15, 31))
10863 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10864
10865 // FIXME: Implement direct support for this type!
10866 return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10867}
10868
10869/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10870static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10871 const X86Subtarget *Subtarget,
10872 SelectionDAG &DAG) {
10873 SDLoc DL(Op);
10874 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10875 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10876 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10877 ArrayRef<int> Mask = SVOp->getMask();
10878 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10879 assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10880
10881 // FIXME: Implement direct support for this type!
10882 return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10883}
10884
10885/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10886static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10887 const X86Subtarget *Subtarget,
10888 SelectionDAG &DAG) {
10889 SDLoc DL(Op);
10890 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10891 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10892 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10893 ArrayRef<int> Mask = SVOp->getMask();
10894 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10895 assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10896
10897 // FIXME: Implement direct support for this type!
10898 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10899}
10900
10901/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10902///
10903/// This routine either breaks down the specific type of a 512-bit x86 vector
10904/// shuffle or splits it into two 256-bit shuffles and fuses the results back
10905/// together based on the available instructions.
10906static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10907 MVT VT, const X86Subtarget *Subtarget,
10908 SelectionDAG &DAG) {
10909 SDLoc DL(Op);
10910 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10911 ArrayRef<int> Mask = SVOp->getMask();
10912 assert(Subtarget->hasAVX512() &&
10913 "Cannot lower 512-bit vectors w/ basic ISA!");
10914
10915 // Check for being able to broadcast a single element.
10916 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10917 Mask, Subtarget, DAG))
10918 return Broadcast;
10919
10920 // Dispatch to each element type for lowering. If we don't have support for
10921 // specific element type shuffles at 512 bits, immediately split them and
10922 // lower them. Each lowering routine of a given type is allowed to assume that
10923 // the requisite ISA extensions for that element type are available.
10924 switch (VT.SimpleTy) {
10925 case MVT::v8f64:
10926 return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10927 case MVT::v16f32:
10928 return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10929 case MVT::v8i64:
10930 return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10931 case MVT::v16i32:
10932 return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10933 case MVT::v32i16:
10934 if (Subtarget->hasBWI())
10935 return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10936 break;
10937 case MVT::v64i8:
10938 if (Subtarget->hasBWI())
10939 return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10940 break;
10941
10942 default:
10943 llvm_unreachable("Not a valid 512-bit x86 vector type!");
10944 }
10945
10946 // Otherwise fall back on splitting.
10947 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10948}
10949
10950/// \brief Top-level lowering for x86 vector shuffles.
10951///
10952/// This handles decomposition, canonicalization, and lowering of all x86
10953/// vector shuffles. Most of the specific lowering strategies are encapsulated
10954/// above in helper routines. The canonicalization attempts to widen shuffles
10955/// to involve fewer lanes of wider elements, consolidate symmetric patterns
10956/// s.t. only one of the two inputs needs to be tested, etc.
10957static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10958 SelectionDAG &DAG) {
10959 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10960 ArrayRef<int> Mask = SVOp->getMask();
10961 SDValue V1 = Op.getOperand(0);
10962 SDValue V2 = Op.getOperand(1);
10963 MVT VT = Op.getSimpleValueType();
10964 int NumElements = VT.getVectorNumElements();
10965 SDLoc dl(Op);
10966
10967 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10968
10969 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10970 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10971 if (V1IsUndef && V2IsUndef)
10972 return DAG.getUNDEF(VT);
10973
10974 // When we create a shuffle node we put the UNDEF node as the second operand,
10975 // but in some cases the first operand may be transformed to UNDEF.
10976 // In this case we should just commute the node.
10977 if (V1IsUndef)
10978 return DAG.getCommutedVectorShuffle(*SVOp);
10979
10980 // Check for non-undef masks pointing at an undef vector and make the masks
10981 // undef as well. This makes it easier to match the shuffle based solely on
10982 // the mask.
10983 if (V2IsUndef)
10984 for (int M : Mask)
10985 if (M >= NumElements) {
10986 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10987 for (int &M : NewMask)
10988 if (M >= NumElements)
10989 M = -1;
10990 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
10991 }
10992
10993 // Try to collapse shuffles into using a vector type with fewer elements but
10994 // wider element types. We cap this to not form integers or floating point
10995 // elements wider than 64 bits, but it might be interesting to form i128
10996 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
10997 SmallVector<int, 16> WidenedMask;
10998 if (VT.getScalarSizeInBits() < 64 &&
10999 canWidenShuffleElements(Mask, WidenedMask)) {
11000 MVT NewEltVT = VT.isFloatingPoint()
11001 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11002 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11003 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11004 // Make sure that the new vector type is legal. For example, v2f64 isn't
11005 // legal on SSE1.
11006 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11007 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11008 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11009 return DAG.getNode(ISD::BITCAST, dl, VT,
11010 DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11011 }
11012 }
11013
11014 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11015 for (int M : SVOp->getMask())
11016 if (M < 0)
11017 ++NumUndefElements;
11018 else if (M < NumElements)
11019 ++NumV1Elements;
11020 else
11021 ++NumV2Elements;
11022
11023 // Commute the shuffle as needed such that more elements come from V1 than
11024 // V2. This allows us to match the shuffle pattern strictly on how many
11025 // elements come from V1 without handling the symmetric cases.
11026 if (NumV2Elements > NumV1Elements)
11027 return DAG.getCommutedVectorShuffle(*SVOp);
11028
11029 // When the number of V1 and V2 elements are the same, try to minimize the
11030 // number of uses of V2 in the low half of the vector. When that is tied,
11031 // ensure that the sum of indices for V1 is equal to or lower than the sum
11032 // of indices for V2. When those are equal, try to ensure that the number of odd
11033 // indices for V1 is lower than the number of odd indices for V2.
11034 if (NumV1Elements == NumV2Elements) {
11035 int LowV1Elements = 0, LowV2Elements = 0;
11036 for (int M : SVOp->getMask().slice(0, NumElements / 2))
11037 if (M >= NumElements)
11038 ++LowV2Elements;
11039 else if (M >= 0)
11040 ++LowV1Elements;
11041 if (LowV2Elements > LowV1Elements) {
11042 return DAG.getCommutedVectorShuffle(*SVOp);
11043 } else if (LowV2Elements == LowV1Elements) {
11044 int SumV1Indices = 0, SumV2Indices = 0;
11045 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11046 if (SVOp->getMask()[i] >= NumElements)
11047 SumV2Indices += i;
11048 else if (SVOp->getMask()[i] >= 0)
11049 SumV1Indices += i;
11050 if (SumV2Indices < SumV1Indices) {
11051 return DAG.getCommutedVectorShuffle(*SVOp);
11052 } else if (SumV2Indices == SumV1Indices) {
11053 int NumV1OddIndices = 0, NumV2OddIndices = 0;
11054 for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11055 if (SVOp->getMask()[i] >= NumElements)
11056 NumV2OddIndices += i % 2;
11057 else if (SVOp->getMask()[i] >= 0)
11058 NumV1OddIndices += i % 2;
11059 if (NumV2OddIndices < NumV1OddIndices)
11060 return DAG.getCommutedVectorShuffle(*SVOp);
11061 }
11062 }
11063 }
11064
11065 // For each vector width, delegate to a specialized lowering routine.
11066 if (VT.getSizeInBits() == 128)
11067 return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11068
11069 if (VT.getSizeInBits() == 256)
11070 return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11071
11072 // Force AVX-512 vectors to be scalarized for now.
11073 // FIXME: Implement AVX-512 support!
11074 if (VT.getSizeInBits() == 512)
11075 return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11076
11077 llvm_unreachable("Unimplemented!");
11078}
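The widening step above tries to collapse each pair of mask entries into one entry for a vector of half as many, twice-as-wide elements. A simplified standalone model of the pairing rule (hypothetical name; undef-tolerant, aligned pairs only, not the exact LLVM helper):

#include <cstdio>
#include <vector>

// Simplified model: a pair (Mask[2i], Mask[2i+1]) can be widened when both are
// undef, or when the defined entries are consistent with one aligned pair
// (even index M followed by M + 1) in the source vectors.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Widened.push_back(-1);
    } else if (Lo < 0 && (Hi % 2) == 1) {
      Widened.push_back(Hi / 2);
    } else if (Hi < 0 && (Lo % 2) == 0) {
      Widened.push_back(Lo / 2);
    } else if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1) {
      Widened.push_back(Lo / 2);
    } else {
      return false; // the pair straddles wider elements; keep the narrow shuffle
    }
  }
  return true;
}

int main() {
  std::vector<int> Widened;
  // A v8i16 mask that moves whole 32-bit chunks widens to a v4i32 mask.
  if (widenShuffleMask({0, 1, 10, 11, 4, 5, -1, -1}, Widened))
    for (int M : Widened)
      std::printf("%d ", M);            // 0 5 2 -1
  std::printf("\n");
  return 0;
}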
11079
11080
11081//===----------------------------------------------------------------------===//
11082// Legacy vector shuffle lowering
11083//
11084// This code is the legacy code handling vector shuffles until the above
11085 // replaces it in both functionality and performance.
11086//===----------------------------------------------------------------------===//
11087
11088static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11089 bool hasInt256, unsigned *MaskOut = nullptr) {
11090 MVT EltVT = VT.getVectorElementType();
11091
11092 // There is no blend with immediate in AVX-512.
11093 if (VT.is512BitVector())
11094 return false;
11095
11096 if (!hasSSE41 || EltVT == MVT::i8)
11097 return false;
11098 if (!hasInt256 && VT == MVT::v16i16)
11099 return false;
11100
11101 unsigned MaskValue = 0;
11102 unsigned NumElems = VT.getVectorNumElements();
11103 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11104 unsigned NumLanes = (NumElems - 1) / 8 + 1;
11105 unsigned NumElemsInLane = NumElems / NumLanes;
11106
11107 // Blend for v16i16 should be symmetric for both lanes.
11108 for (unsigned i = 0; i < NumElemsInLane; ++i) {
11109
11110 int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11111 int EltIdx = MaskVals[i];
11112
11113 if ((EltIdx < 0 || EltIdx == (int)i) &&
11114 (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11115 continue;
11116
11117 if (((unsigned)EltIdx == (i + NumElems)) &&
11118 (SndLaneEltIdx < 0 ||
11119 (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11120 MaskValue |= (1 << i);
11121 else
11122 return false;
11123 }
11124
11125 if (MaskOut)
11126 *MaskOut = MaskValue;
11127 return true;
11128}
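MaskValue computed above is a blend immediate in which bit i selects element i of the second source, mirrored across lanes for v16i16. A scalar model of how such an immediate picks elements (hypothetical names, not an encoding of BLENDI itself):

#include <cstdio>
#include <vector>

// Scalar model of a blend-with-immediate: result[i] comes from B when the
// (i % LaneElems)-th bit of the immediate is set, otherwise from A. Reusing the
// bit across lanes models the symmetric v16i16 requirement checked above.
static std::vector<int> blendByImmediate(const std::vector<int> &A,
                                         const std::vector<int> &B,
                                         unsigned Imm, unsigned LaneElems) {
  std::vector<int> R(A.size());
  for (size_t i = 0; i < A.size(); ++i)
    R[i] = (Imm >> (i % LaneElems)) & 1 ? B[i] : A[i];
  return R;
}

int main() {
  std::vector<int> A = {0, 1, 2, 3}, B = {10, 11, 12, 13};
  // Immediate 0b0110 takes elements 1 and 2 from B.
  for (int V : blendByImmediate(A, B, 0x6, 4))
    std::printf("%d ", V);              // 0 11 12 3
  std::printf("\n");
  return 0;
}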
11129
11130// Try to lower a shuffle node into a simple blend instruction.
11131// This function assumes isBlendMask returns true for this
11132 // ShuffleVectorSDNode.
11133static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11134 unsigned MaskValue,
11135 const X86Subtarget *Subtarget,
11136 SelectionDAG &DAG) {
11137 MVT VT = SVOp->getSimpleValueType(0);
11138 MVT EltVT = VT.getVectorElementType();
11139 assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11140 Subtarget->hasInt256() && "Trying to lower a "
11141 "VECTOR_SHUFFLE to a Blend but "
11142 "with the wrong mask"));
11143 SDValue V1 = SVOp->getOperand(0);
11144 SDValue V2 = SVOp->getOperand(1);
11145 SDLoc dl(SVOp);
11146 unsigned NumElems = VT.getVectorNumElements();
11147
11148 // Convert i32 vectors to floating point if it is not AVX2.
11149 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11150 MVT BlendVT = VT;
11151 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11152 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11153 NumElems);
11154 V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11155 V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11156 }
11157
11158 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11159 DAG.getConstant(MaskValue, MVT::i32));
11160 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11161}
11162
11163/// In vector type \p VT, return true if the element at index \p InputIdx
11164/// falls on a different 128-bit lane than \p OutputIdx.
11165static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11166 unsigned OutputIdx) {
11167 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11168 return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11169}
11170
11171/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
11172/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
11173/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
11174/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11175/// zero.
11176static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11177 SelectionDAG &DAG) {
11178 MVT VT = V1.getSimpleValueType();
11179 assert(VT.is128BitVector() || VT.is256BitVector());
11180
11181 MVT EltVT = VT.getVectorElementType();
11182 unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11183 unsigned NumElts = VT.getVectorNumElements();
11184
11185 SmallVector<SDValue, 32> PshufbMask;
11186 for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11187 int InputIdx = MaskVals[OutputIdx];
11188 unsigned InputByteIdx;
11189
11190 if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11191 InputByteIdx = 0x80;
11192 else {
11193 // Cross lane is not allowed.
11194 if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11195 return SDValue();
11196 InputByteIdx = InputIdx * EltSizeInBytes;
11197 // Index is a byte offset within the 128-bit lane.
11198 InputByteIdx &= 0xf;
11199 }
11200
11201 for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11202 PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11203 if (InputByteIdx != 0x80)
11204 ++InputByteIdx;
11205 }
11206 }
11207
11208 MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11209 if (ShufVT != VT)
11210 V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11211 return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11212 DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11213}
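getPSHUFB emits 0x80 to zero a result byte and otherwise an in-lane byte offset. A scalar model of that PSHUFB behaviour under the assumption of independent 16-byte lanes (hypothetical names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of PSHUFB: within each 16-byte lane, a mask byte with the high
// bit set writes zero; otherwise its low four bits select a byte from the same
// lane of the source.
static std::vector<uint8_t> pshufbModel(const std::vector<uint8_t> &Src,
                                        const std::vector<uint8_t> &Mask) {
  std::vector<uint8_t> R(Src.size(), 0);
  for (size_t i = 0; i < Src.size(); ++i) {
    size_t LaneBase = (i / 16) * 16;
    if (Mask[i] & 0x80)
      R[i] = 0;                              // 0x80 zeroes the slot
    else
      R[i] = Src[LaneBase + (Mask[i] & 0x0f)];
  }
  return R;
}

int main() {
  std::vector<uint8_t> Src(16), Mask(16);
  for (int i = 0; i < 16; ++i) {
    Src[i] = static_cast<uint8_t>(100 + i);
    Mask[i] = static_cast<uint8_t>(15 - i);  // reverse the lane
  }
  Mask[0] = 0x80;                            // zero out the first result byte
  for (uint8_t B : pshufbModel(Src, Mask))
    std::printf("%d ", B);                   // 0 114 113 ... 101 100
  std::printf("\n");
  return 0;
}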
11214
11215// v8i16 shuffles - Prefer shuffles in the following order:
11216// 1. [all] pshuflw, pshufhw, optional move
11217// 2. [ssse3] 1 x pshufb
11218// 3. [ssse3] 2 x pshufb + 1 x por
11219// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11220static SDValue
11221LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11222 SelectionDAG &DAG) {
11223 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11224 SDValue V1 = SVOp->getOperand(0);
11225 SDValue V2 = SVOp->getOperand(1);
11226 SDLoc dl(SVOp);
11227 SmallVector<int, 8> MaskVals;
11228
11229 // Determine if more than 1 of the words in each of the low and high quadwords
11230 // of the result come from the same quadword of one of the two inputs. Undef
11231 // mask values count as coming from any quadword, for better codegen.
11232 //
11233 // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11234 // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11235 unsigned LoQuad[] = { 0, 0, 0, 0 };
11236 unsigned HiQuad[] = { 0, 0, 0, 0 };
11237 // Indices of quads used.
11238 std::bitset<4> InputQuads;
11239 for (unsigned i = 0; i < 8; ++i) {
11240 unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11241 int EltIdx = SVOp->getMaskElt(i);
11242 MaskVals.push_back(EltIdx);
11243 if (EltIdx < 0) {
11244 ++Quad[0];
11245 ++Quad[1];
11246 ++Quad[2];
11247 ++Quad[3];
11248 continue;
11249 }
11250 ++Quad[EltIdx / 4];
11251 InputQuads.set(EltIdx / 4);
11252 }
11253
11254 int BestLoQuad = -1;
11255 unsigned MaxQuad = 1;
11256 for (unsigned i = 0; i < 4; ++i) {
11257 if (LoQuad[i] > MaxQuad) {
11258 BestLoQuad = i;
11259 MaxQuad = LoQuad[i];
11260 }
11261 }
11262
11263 int BestHiQuad = -1;
11264 MaxQuad = 1;
11265 for (unsigned i = 0; i < 4; ++i) {
11266 if (HiQuad[i] > MaxQuad) {
11267 BestHiQuad = i;
11268 MaxQuad = HiQuad[i];
11269 }
11270 }
11271
11272 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
11273 // of the two input vectors, shuffle them into one input vector so only a
11274 // single pshufb instruction is necessary. If there are more than 2 input
11275 // quads, disable the next transformation since it does not help SSSE3.
11276 bool V1Used = InputQuads[0] || InputQuads[1];
11277 bool V2Used = InputQuads[2] || InputQuads[3];
11278 if (Subtarget->hasSSSE3()) {
11279 if (InputQuads.count() == 2 && V1Used && V2Used) {
11280 BestLoQuad = InputQuads[0] ? 0 : 1;
11281 BestHiQuad = InputQuads[2] ? 2 : 3;
11282 }
11283 if (InputQuads.count() > 2) {
11284 BestLoQuad = -1;
11285 BestHiQuad = -1;
11286 }
11287 }
11288
11289 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11290 // the shuffle mask. If a quad is scored as -1, that means that it contains
11291 // words from all 4 input quadwords.
11292 SDValue NewV;
11293 if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11294 int MaskV[] = {
11295 BestLoQuad < 0 ? 0 : BestLoQuad,
11296 BestHiQuad < 0 ? 1 : BestHiQuad
11297 };
11298 NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11299 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11300 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11301 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11302
11303 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11304 // source words for the shuffle, to aid later transformations.
11305 bool AllWordsInNewV = true;
11306 bool InOrder[2] = { true, true };
11307 for (unsigned i = 0; i != 8; ++i) {
11308 int idx = MaskVals[i];
11309 if (idx != (int)i)
11310 InOrder[i/4] = false;
11311 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11312 continue;
11313 AllWordsInNewV = false;
11314 break;
11315 }
11316
11317 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11318 if (AllWordsInNewV) {
11319 for (int i = 0; i != 8; ++i) {
11320 int idx = MaskVals[i];
11321 if (idx < 0)
11322 continue;
11323 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11324 if ((idx != i) && idx < 4)
11325 pshufhw = false;
11326 if ((idx != i) && idx > 3)
11327 pshuflw = false;
11328 }
11329 V1 = NewV;
11330 V2Used = false;
11331 BestLoQuad = 0;
11332 BestHiQuad = 1;
11333 }
11334
11335 // If we've eliminated the use of V2, and the new mask is a pshuflw or
11336 // pshufhw, that's as cheap as it gets. Return the new shuffle.
11337 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11338 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11339 unsigned TargetMask = 0;
11340 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11341 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11342 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11343 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11344 getShufflePSHUFLWImmediate(SVOp);
11345 V1 = NewV.getOperand(0);
11346 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11347 }
11348 }
11349
11350 // Promote splats to a larger type which usually leads to more efficient code.
11351 // FIXME: Is this true if pshufb is available?
11352 if (SVOp->isSplat())
11353 return PromoteSplat(SVOp, DAG);
11354
11355 // If we have SSSE3, and all words of the result are from 1 input vector,
11356 // case 2 is generated, otherwise case 3 is generated. If no SSSE3
11357 // is present, fall back to case 4.
11358 if (Subtarget->hasSSSE3()) {
11359 SmallVector<SDValue,16> pshufbMask;
11360
11361 // If we have elements from both input vectors, set the high bit of the
11362 // shuffle mask element to zero out elements that come from V2 in the V1
11363 // mask, and elements that come from V1 in the V2 mask, so that the two
11364 // results can be OR'd together.
11365 bool TwoInputs = V1Used && V2Used;
11366 V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11367 if (!TwoInputs)
11368 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11369
11370 // Calculate the shuffle mask for the second input, shuffle it, and
11371 // OR it with the first shuffled input.
11372 CommuteVectorShuffleMask(MaskVals, 8);
11373 V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11374 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11375 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11376 }
11377
11378 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11379 // and update MaskVals with new element order.
11380 std::bitset<8> InOrder;
11381 if (BestLoQuad >= 0) {
11382 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11383 for (int i = 0; i != 4; ++i) {
11384 int idx = MaskVals[i];
11385 if (idx < 0) {
11386 InOrder.set(i);
11387 } else if ((idx / 4) == BestLoQuad) {
11388 MaskV[i] = idx & 3;
11389 InOrder.set(i);
11390 }
11391 }
11392 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11393 &MaskV[0]);
11394
11395 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11396 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11397 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11398 NewV.getOperand(0),
11399 getShufflePSHUFLWImmediate(SVOp), DAG);
11400 }
11401 }
11402
11403 // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11404 // and update MaskVals with the new element order.
11405 if (BestHiQuad >= 0) {
11406 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11407 for (unsigned i = 4; i != 8; ++i) {
11408 int idx = MaskVals[i];
11409 if (idx < 0) {
11410 InOrder.set(i);
11411 } else if ((idx / 4) == BestHiQuad) {
11412 MaskV[i] = (idx & 3) + 4;
11413 InOrder.set(i);
11414 }
11415 }
11416 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11417 &MaskV[0]);
11418
11419 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11420 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11421 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11422 NewV.getOperand(0),
11423 getShufflePSHUFHWImmediate(SVOp), DAG);
11424 }
11425 }
11426
11427 // In case BestHi & BestLo were both -1, which means each quadword has a word
11428 // from each of the four input quadwords, calculate the InOrder bitvector now
11429 // before falling through to the insert/extract cleanup.
11430 if (BestLoQuad == -1 && BestHiQuad == -1) {
11431 NewV = V1;
11432 for (int i = 0; i != 8; ++i)
11433 if (MaskVals[i] < 0 || MaskVals[i] == i)
11434 InOrder.set(i);
11435 }
11436
11437 // The other elements are put in the right place using pextrw and pinsrw.
11438 for (unsigned i = 0; i != 8; ++i) {
11439 if (InOrder[i])
11440 continue;
11441 int EltIdx = MaskVals[i];
11442 if (EltIdx < 0)
11443 continue;
11444 SDValue ExtOp = (EltIdx < 8) ?
11445 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11446 DAG.getIntPtrConstant(EltIdx)) :
11447 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11448 DAG.getIntPtrConstant(EltIdx - 8));
11449 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11450 DAG.getIntPtrConstant(i));
11451 }
11452 return NewV;
11453}
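The v8i16 routine above scores each input quadword by how many result words it feeds, crediting undef words to every quad, and only keeps a quad that supplies more than one word. A standalone sketch of that scoring for the low half of the result (hypothetical name):

#include <array>
#include <cstdio>

// Hypothetical model of the quad scoring: for the four low result words, count
// how many come from each of the four input quadwords (0-1 in V1, 2-3 in V2);
// undef entries count toward every quad. The "best" quad must feed > 1 word.
static int bestLowQuad(const std::array<int, 8> &Mask) {
  std::array<unsigned, 4> LoQuad = {0, 0, 0, 0};
  for (int i = 0; i < 4; ++i) {
    int EltIdx = Mask[i];
    if (EltIdx < 0) {
      for (unsigned &Q : LoQuad)
        ++Q;
      continue;
    }
    ++LoQuad[EltIdx / 4];
  }
  int Best = -1;
  unsigned MaxQuad = 1;
  for (int q = 0; q < 4; ++q)
    if (LoQuad[q] > MaxQuad) {
      Best = q;
      MaxQuad = LoQuad[q];
    }
  return Best;
}

int main() {
  // Low words <9, 8, -1, 11, ...> mostly come from quad 2 (words 8-11, i.e. V2's low quad).
  std::printf("%d\n", bestLowQuad({9, 8, -1, 11, 0, 1, 2, 3})); // prints 2
  return 0;
}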
11454
11455/// \brief v16i16 shuffles
11456///
11457/// FIXME: We only support generation of a single pshufb currently. We can
11458/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11459/// well (e.g. 2 x pshufb + 1 x por).
11460static SDValue
11461LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11462 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11463 SDValue V1 = SVOp->getOperand(0);
11464 SDValue V2 = SVOp->getOperand(1);
11465 SDLoc dl(SVOp);
11466
11467 if (V2.getOpcode() != ISD::UNDEF)
11468 return SDValue();
11469
11470 SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11471 return getPSHUFB(MaskVals, V1, dl, DAG);
11472}
11473
11474// v16i8 shuffles - Prefer shuffles in the following order:
11475// 1. [ssse3] 1 x pshufb
11476// 2. [ssse3] 2 x pshufb + 1 x por
11477// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
11478static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11479 const X86Subtarget* Subtarget,
11480 SelectionDAG &DAG) {
11481 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11482 SDValue V1 = SVOp->getOperand(0);
11483 SDValue V2 = SVOp->getOperand(1);
11484 SDLoc dl(SVOp);
11485 ArrayRef<int> MaskVals = SVOp->getMask();
11486
11487 // Promote splats to a larger type which usually leads to more efficient code.
11488 // FIXME: Is this true if pshufb is available?
11489 if (SVOp->isSplat())
11490 return PromoteSplat(SVOp, DAG);
11491
11492 // If we have SSSE3, case 1 is generated when all result bytes come from
11493 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
11494 // present, fall back to case 3.
11495
11496 // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11497 if (Subtarget->hasSSSE3()) {
11498 SmallVector<SDValue,16> pshufbMask;
11499
11500 // If all result elements are from one input vector, then only translate
11501 // undef mask values to 0x80 (zero out result) in the pshufb mask.
11502 //
11503 // Otherwise, we have elements from both input vectors, and must zero out
11504 // elements that come from V2 in the first mask, and V1 in the second mask
11505 // so that we can OR them together.
11506 for (unsigned i = 0; i != 16; ++i) {
11507 int EltIdx = MaskVals[i];
11508 if (EltIdx < 0 || EltIdx >= 16)
11509 EltIdx = 0x80;
11510 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11511 }
11512 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11513 DAG.getNode(ISD::BUILD_VECTOR, dl,
11514 MVT::v16i8, pshufbMask));
11515
11516 // As PSHUFB will zero elements with negative indices, it's safe to ignore
11517 // the 2nd operand if it's undefined or zero.
11518 if (V2.getOpcode() == ISD::UNDEF ||
11519 ISD::isBuildVectorAllZeros(V2.getNode()))
11520 return V1;
11521
11522 // Calculate the shuffle mask for the second input, shuffle it, and
11523 // OR it with the first shuffled input.
11524 pshufbMask.clear();
11525 for (unsigned i = 0; i != 16; ++i) {
11526 int EltIdx = MaskVals[i];
11527 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11528 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11529 }
11530 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11531 DAG.getNode(ISD::BUILD_VECTOR, dl,
11532 MVT::v16i8, pshufbMask));
11533 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11534 }
11535
11536 // No SSSE3 - Calculate in-place words and then fix all out-of-place words
11537 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
11538 // the 16 different words that comprise the two doublequadword input vectors.
11539 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11540 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11541 SDValue NewV = V1;
11542 for (int i = 0; i != 8; ++i) {
11543 int Elt0 = MaskVals[i*2];
11544 int Elt1 = MaskVals[i*2+1];
11545
11546 // This word of the result is all undef, skip it.
11547 if (Elt0 < 0 && Elt1 < 0)
11548 continue;
11549
11550 // This word of the result is already in the correct place, skip it.
11551 if ((Elt0 == i*2) && (Elt1 == i*2+1))
11552 continue;
11553
11554 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11555 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11556 SDValue InsElt;
11557
11558 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
11559 // together using a single extract, load it and store it.
11560 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11561 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11562 DAG.getIntPtrConstant(Elt1 / 2));
11563 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11564 DAG.getIntPtrConstant(i));
11565 continue;
11566 }
11567
11568 // If Elt1 is defined, extract it from the appropriate source. If the
11569 // source byte is not also odd, shift the extracted word left 8 bits;
11570 // otherwise clear the bottom 8 bits if we need to do an OR.
11571 if (Elt1 >= 0) {
11572 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11573 DAG.getIntPtrConstant(Elt1 / 2));
11574 if ((Elt1 & 1) == 0)
11575 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11576 DAG.getConstant(8,
11577 TLI.getShiftAmountTy(InsElt.getValueType())));
11578 else if (Elt0 >= 0)
11579 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11580 DAG.getConstant(0xFF00, MVT::i16));
11581 }
11582 // If Elt0 is defined, extract it from the appropriate source. If the
11583 // source byte is not also even, shift the extracted word right 8 bits. If
11584 // Elt1 was also defined, OR the extracted values together before
11585 // inserting them in the result.
11586 if (Elt0 >= 0) {
11587 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11588 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11589 if ((Elt0 & 1) != 0)
11590 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11591 DAG.getConstant(8,
11592 TLI.getShiftAmountTy(InsElt0.getValueType())));
11593 else if (Elt1 >= 0)
11594 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11595 DAG.getConstant(0x00FF, MVT::i16));
11596 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11597 : InsElt0;
11598 }
11599 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11600 DAG.getIntPtrConstant(i));
11601 }
11602 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11603}
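When result bytes come from both inputs, the SSSE3 path above shuffles each input with a mask that zeroes the other input's slots (byte index 0x80) and then ORs the two results. A scalar model of that two-shuffle-plus-OR merge (hypothetical names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the two-pshufb-plus-por merge for one 16-byte lane: bytes
// sourced from the other input are zeroed (the 0x80 behaviour), so OR-ing the
// two shuffled halves reassembles the full result.
static std::vector<uint8_t> shuffleOrMerge(const std::vector<uint8_t> &V1,
                                           const std::vector<uint8_t> &V2,
                                           const std::vector<int> &Mask) {
  std::vector<uint8_t> R(Mask.size(), 0);
  for (size_t i = 0; i < Mask.size(); ++i) {
    int M = Mask[i];
    uint8_t FromV1 = (M >= 0 && M < 16) ? V1[M] : 0;        // V1 pass
    uint8_t FromV2 = (M >= 16 && M < 32) ? V2[M - 16] : 0;  // V2 pass
    R[i] = FromV1 | FromV2;                                  // the final POR
  }
  return R;
}

int main() {
  std::vector<uint8_t> V1(16), V2(16);
  for (int i = 0; i < 16; ++i) {
    V1[i] = static_cast<uint8_t>(i);
    V2[i] = static_cast<uint8_t>(100 + i);
  }
  // Interleave the first bytes of V1 and V2: <0, 16, 1, 17, ...>.
  std::vector<int> Mask = {0, 16, 1, 17, 2, 18, 3, 19};
  for (uint8_t B : shuffleOrMerge(V1, V2, Mask))
    std::printf("%d ", B);              // 0 100 1 101 2 102 3 103
  std::printf("\n");
  return 0;
}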
11604
11605// v32i8 shuffles - Translate to VPSHUFB if possible.
11606static
11607SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11608 const X86Subtarget *Subtarget,
11609 SelectionDAG &DAG) {
11610 MVT VT = SVOp->getSimpleValueType(0);
11611 SDValue V1 = SVOp->getOperand(0);
11612 SDValue V2 = SVOp->getOperand(1);
11613 SDLoc dl(SVOp);
11614 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11615
11616 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11617 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11618 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11619
11620 // VPSHUFB may be generated if
11621 // (1) one of the input vectors is undefined or zeroinitializer.
11622 // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11623 // And (2) the mask indexes don't cross a 128-bit lane boundary.
11624 if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11625 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11626 return SDValue();
11627
11628 if (V1IsAllZero && !V2IsAllZero) {
11629 CommuteVectorShuffleMask(MaskVals, 32);
11630 V1 = V2;
11631 }
11632 return getPSHUFB(MaskVals, V1, dl, DAG);
11633}
11634
11635/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11636/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11637/// done when every pair / quad of shuffle mask elements points to elements in
11638/// the right sequence. e.g.
11639/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11640static
11641SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11642 SelectionDAG &DAG) {
11643 MVT VT = SVOp->getSimpleValueType(0);
11644 SDLoc dl(SVOp);
11645 unsigned NumElems = VT.getVectorNumElements();
11646 MVT NewVT;
11647 unsigned Scale;
11648 switch (VT.SimpleTy) {
11649 default: llvm_unreachable("Unexpected!");
11650 case MVT::v2i64:
11651 case MVT::v2f64:
11652 return SDValue(SVOp, 0);
11653 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
11654 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
11655 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
11656 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
11657 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11658 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
11659 }
11660
11661 SmallVector<int, 8> MaskVec;
11662 for (unsigned i = 0; i != NumElems; i += Scale) {
11663 int StartIdx = -1;
11664 for (unsigned j = 0; j != Scale; ++j) {
11665 int EltIdx = SVOp->getMaskElt(i+j);
11666 if (EltIdx < 0)
11667 continue;
11668 if (StartIdx < 0)
11669 StartIdx = (EltIdx / Scale);
11670 if (EltIdx != (int)(StartIdx*Scale + j))
11671 return SDValue();
11672 }
11673 MaskVec.push_back(StartIdx);
11674 }
11675
11676 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11677 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11678 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11679}
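// Illustrative sketch (not part of the original source). It mirrors the mask
// check performed by RewriteAsNarrowerShuffle above: each group of 'Scale'
// consecutive mask elements must address 'Scale' consecutive source elements
// starting at a multiple of 'Scale', and the whole group then collapses to a
// single wide-element index. The helper name narrowShuffleMask is hypothetical.
#include <vector>

static bool narrowShuffleMask(const std::vector<int> &Mask, unsigned Scale,
                              std::vector<int> &NarrowMask) {
  NarrowMask.clear();
  for (size_t i = 0; i < Mask.size(); i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j < Scale; ++j) {
      int EltIdx = Mask[i + j];
      if (EltIdx < 0)
        continue;                        // undef elements match anything
      if (StartIdx < 0)
        StartIdx = EltIdx / Scale;       // first defined element picks the group
      if (EltIdx != StartIdx * static_cast<int>(Scale) + static_cast<int>(j))
        return false;                    // elements are not in the right sequence
    }
    NarrowMask.push_back(StartIdx);      // -1 if the whole group was undef
  }
  return true;
}
// Example: the v8i16 mask <2,3, 10,11, 0,1, 14,15> with Scale = 2 narrows to
// the v4i32 mask <1, 5, 0, 7>.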
11680
11681/// getVZextMovL - Return a zero-extending vector move low node.
11682///
11683static SDValue getVZextMovL(MVT VT, MVT OpVT,
11684 SDValue SrcOp, SelectionDAG &DAG,
11685 const X86Subtarget *Subtarget, SDLoc dl) {
11686 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11687 LoadSDNode *LD = nullptr;
11688 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11689 LD = dyn_cast<LoadSDNode>(SrcOp);
11690 if (!LD) {
11691 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11692 // instead.
11693 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11694 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11695 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11696 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11697 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11698 // PR2108
11699 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11700 return DAG.getNode(ISD::BITCAST, dl, VT,
11701 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11702 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11703 OpVT,
11704 SrcOp.getOperand(0)
11705 .getOperand(0))));
11706 }
11707 }
11708 }
11709
11710 return DAG.getNode(ISD::BITCAST, dl, VT,
11711 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11712 DAG.getNode(ISD::BITCAST, dl,
11713 OpVT, SrcOp)));
11714}
11715
11716 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
11717 /// which could not be matched by any known target specific shuffle
11718static SDValue
11719LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11720
11721 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11722 if (NewOp.getNode())
11723 return NewOp;
11724
11725 MVT VT = SVOp->getSimpleValueType(0);
11726
11727 unsigned NumElems = VT.getVectorNumElements();
11728 unsigned NumLaneElems = NumElems / 2;
11729
11730 SDLoc dl(SVOp);
11731 MVT EltVT = VT.getVectorElementType();
11732 MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11733 SDValue Output[2];
11734
11735 SmallVector<int, 16> Mask;
11736 for (unsigned l = 0; l < 2; ++l) {
11737 // Build a shuffle mask for the output, discovering on the fly which
11738 // input vectors to use as shuffle operands (recorded in InputUsed).
11739 // If building a suitable shuffle vector proves too hard, then bail
11740 // out with UseBuildVector set.
11741 bool UseBuildVector = false;
11742 int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11743 unsigned LaneStart = l * NumLaneElems;
11744 for (unsigned i = 0; i != NumLaneElems; ++i) {
11745 // The mask element. This indexes into the input.
11746 int Idx = SVOp->getMaskElt(i+LaneStart);
11747 if (Idx < 0) {
11748 // the mask element does not index into any input vector.
11749 Mask.push_back(-1);
11750 continue;
11751 }
11752
11753 // The input vector this mask element indexes into.
11754 int Input = Idx / NumLaneElems;
11755
11756 // Turn the index into an offset from the start of the input vector.
11757 Idx -= Input * NumLaneElems;
11758
11759 // Find or create a shuffle vector operand to hold this input.
11760 unsigned OpNo;
11761 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11762 if (InputUsed[OpNo] == Input)
11763 // This input vector is already an operand.
11764 break;
11765 if (InputUsed[OpNo] < 0) {
11766 // Create a new operand for this input vector.
11767 InputUsed[OpNo] = Input;
11768 break;
11769 }
11770 }
11771
11772 if (OpNo >= array_lengthof(InputUsed)) {
11773 // More than two input vectors used! Give up on trying to create a
11774 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
11775 UseBuildVector = true;
11776 break;
11777 }
11778
11779 // Add the mask index for the new shuffle vector.
11780 Mask.push_back(Idx + OpNo * NumLaneElems);
11781 }
11782
11783 if (UseBuildVector) {
11784 SmallVector<SDValue, 16> SVOps;
11785 for (unsigned i = 0; i != NumLaneElems; ++i) {
11786 // The mask element. This indexes into the input.
11787 int Idx = SVOp->getMaskElt(i+LaneStart);
11788 if (Idx < 0) {
11789 SVOps.push_back(DAG.getUNDEF(EltVT));
11790 continue;
11791 }
11792
11793 // The input vector this mask element indexes into.
11794 int Input = Idx / NumElems;
11795
11796 // Turn the index into an offset from the start of the input vector.
11797 Idx -= Input * NumElems;
11798
11799 // Extract the vector element by hand.
11800 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11801 SVOp->getOperand(Input),
11802 DAG.getIntPtrConstant(Idx)));
11803 }
11804
11805 // Construct the output using a BUILD_VECTOR.
11806 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11807 } else if (InputUsed[0] < 0) {
11808 // No input vectors were used! The result is undefined.
11809 Output[l] = DAG.getUNDEF(NVT);
11810 } else {
11811 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11812 (InputUsed[0] % 2) * NumLaneElems,
11813 DAG, dl);
11814 // If only one input was used, use an undefined vector for the other.
11815 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11816 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11817 (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11818 // At least one input vector was used. Create a new shuffle vector.
11819 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11820 }
11821
11822 Mask.clear();
11823 }
11824
11825 // Concatenate the per-half results back into one 256-bit vector.
11826 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11827}
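// Illustrative sketch (not part of the original source). It models the
// per-half decomposition done above: for each 128-bit half of the result we
// record which narrow source halves are used (at most two) and rewrite the
// mask indices relative to those operands; needing more than two sources
// forces the BUILD_VECTOR fallback. The names HalfShuffle and planHalf are
// hypothetical.
#include <array>
#include <vector>

struct HalfShuffle {
  std::array<int, 2> InputUsed{{-1, -1}}; // narrow source halves, -1 = unused
  std::vector<int> Mask;                  // mask relative to InputUsed operands
  bool NeedsBuildVector = false;          // more than two sources were needed
};

static HalfShuffle planHalf(const std::vector<int> &WideMask,
                            unsigned NumLaneElems, unsigned Half) {
  HalfShuffle Plan;
  for (unsigned i = 0; i != NumLaneElems; ++i) {
    int Idx = WideMask[Half * NumLaneElems + i];
    if (Idx < 0) { Plan.Mask.push_back(-1); continue; }
    int Input = Idx / static_cast<int>(NumLaneElems); // which narrow source half
    Idx -= Input * static_cast<int>(NumLaneElems);    // offset within that half
    unsigned OpNo = 0;
    for (; OpNo != 2; ++OpNo) {
      if (Plan.InputUsed[OpNo] == Input) break;       // already an operand
      if (Plan.InputUsed[OpNo] < 0) { Plan.InputUsed[OpNo] = Input; break; }
    }
    if (OpNo == 2) { Plan.NeedsBuildVector = true; break; }
    Plan.Mask.push_back(Idx + static_cast<int>(OpNo * NumLaneElems));
  }
  return Plan;
}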
11828
11829/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11830/// 4 elements, and match them with several different shuffle types.
11831static SDValue
11832LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11833 SDValue V1 = SVOp->getOperand(0);
11834 SDValue V2 = SVOp->getOperand(1);
11835 SDLoc dl(SVOp);
11836 MVT VT = SVOp->getSimpleValueType(0);
11837
11838 assert(VT.is128BitVector() && "Unsupported vector size");
11839
11840 std::pair<int, int> Locs[4];
11841 int Mask1[] = { -1, -1, -1, -1 };
11842 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11843
11844 unsigned NumHi = 0;
11845 unsigned NumLo = 0;
11846 for (unsigned i = 0; i != 4; ++i) {
11847 int Idx = PermMask[i];
11848 if (Idx < 0) {
11849 Locs[i] = std::make_pair(-1, -1);
11850 } else {
11851 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11852 if (Idx < 4) {
11853 Locs[i] = std::make_pair(0, NumLo);
11854 Mask1[NumLo] = Idx;
11855 NumLo++;
11856 } else {
11857 Locs[i] = std::make_pair(1, NumHi);
11858 if (2+NumHi < 4)
11859 Mask1[2+NumHi] = Idx;
11860 NumHi++;
11861 }
11862 }
11863 }
11864
11865 if (NumLo <= 2 && NumHi <= 2) {
11866 // No more than two elements come from either vector. This can be
11867 // implemented with two shuffles. The first shuffle gathers the elements.
11868 // The second shuffle, which takes the first shuffle as both of its
11869 // vector operands, puts the elements into the right order.
11870 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11871
11872 int Mask2[] = { -1, -1, -1, -1 };
11873
11874 for (unsigned i = 0; i != 4; ++i)
11875 if (Locs[i].first != -1) {
11876 unsigned Idx = (i < 2) ? 0 : 4;
11877 Idx += Locs[i].first * 2 + Locs[i].second;
11878 Mask2[i] = Idx;
11879 }
11880
11881 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11882 }
11883
11884 if (NumLo == 3 || NumHi == 3) {
11885 // Otherwise, we must have three elements from one vector, call it X, and
11886 // one element from the other, call it Y. First, use a shufps to build an
11887 // intermediate vector with the one element from Y and the element from X
11888 // that will be in the same half in the final destination (the indexes don't
11889 // matter). Then, use a shufps to build the final vector, taking the half
11890 // containing the element from Y from the intermediate, and the other half
11891 // from X.
11892 if (NumHi == 3) {
11893 // Normalize it so the 3 elements come from V1.
11894 CommuteVectorShuffleMask(PermMask, 4);
11895 std::swap(V1, V2);
11896 }
11897
11898 // Find the element from V2.
11899 unsigned HiIndex;
11900 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11901 int Val = PermMask[HiIndex];
11902 if (Val < 0)
11903 continue;
11904 if (Val >= 4)
11905 break;
11906 }
11907
11908 Mask1[0] = PermMask[HiIndex];
11909 Mask1[1] = -1;
11910 Mask1[2] = PermMask[HiIndex^1];
11911 Mask1[3] = -1;
11912 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11913
11914 if (HiIndex >= 2) {
11915 Mask1[0] = PermMask[0];
11916 Mask1[1] = PermMask[1];
11917 Mask1[2] = HiIndex & 1 ? 6 : 4;
11918 Mask1[3] = HiIndex & 1 ? 4 : 6;
11919 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11920 }
11921
11922 Mask1[0] = HiIndex & 1 ? 2 : 0;
11923 Mask1[1] = HiIndex & 1 ? 0 : 2;
11924 Mask1[2] = PermMask[2];
11925 Mask1[3] = PermMask[3];
11926 if (Mask1[2] >= 0)
11927 Mask1[2] += 4;
11928 if (Mask1[3] >= 0)
11929 Mask1[3] += 4;
11930 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11931 }
11932
11933 // Break it into (shuffle shuffle_hi, shuffle_lo).
11934 int LoMask[] = { -1, -1, -1, -1 };
11935 int HiMask[] = { -1, -1, -1, -1 };
11936
11937 int *MaskPtr = LoMask;
11938 unsigned MaskIdx = 0;
11939 unsigned LoIdx = 0;
11940 unsigned HiIdx = 2;
11941 for (unsigned i = 0; i != 4; ++i) {
11942 if (i == 2) {
11943 MaskPtr = HiMask;
11944 MaskIdx = 1;
11945 LoIdx = 0;
11946 HiIdx = 2;
11947 }
11948 int Idx = PermMask[i];
11949 if (Idx < 0) {
11950 Locs[i] = std::make_pair(-1, -1);
11951 } else if (Idx < 4) {
11952 Locs[i] = std::make_pair(MaskIdx, LoIdx);
11953 MaskPtr[LoIdx] = Idx;
11954 LoIdx++;
11955 } else {
11956 Locs[i] = std::make_pair(MaskIdx, HiIdx);
11957 MaskPtr[HiIdx] = Idx;
11958 HiIdx++;
11959 }
11960 }
11961
11962 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11963 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11964 int MaskOps[] = { -1, -1, -1, -1 };
11965 for (unsigned i = 0; i != 4; ++i)
11966 if (Locs[i].first != -1)
11967 MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11968 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11969}
11970
11971static bool MayFoldVectorLoad(SDValue V) {
11972 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11973 V = V.getOperand(0);
11974
11975 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
11976 V = V.getOperand(0);
11977 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
11978 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
11979 // BUILD_VECTOR (load), undef
11980 V = V.getOperand(0);
11981
11982 return MayFoldLoad(V);
11983}
11984
11985static
11986SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
11987 MVT VT = Op.getSimpleValueType();
11988
11989 // Canonicalize to v2f64.
11990 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
11991 return DAG.getNode(ISD::BITCAST, dl, VT,
11992 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
11993 V1, DAG));
11994}
11995
11996static
11997SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
11998 bool HasSSE2) {
11999 SDValue V1 = Op.getOperand(0);
12000 SDValue V2 = Op.getOperand(1);
12001 MVT VT = Op.getSimpleValueType();
12002
12003 assert(VT != MVT::v2i64 && "unsupported shuffle type");
12004
12005 if (HasSSE2 && VT == MVT::v2f64)
12006 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12007
12008 // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
12009 return DAG.getNode(ISD::BITCAST, dl, VT,
12010 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12011 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12012 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12013}
12014
12015static
12016SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12017 SDValue V1 = Op.getOperand(0);
12018 SDValue V2 = Op.getOperand(1);
12019 MVT VT = Op.getSimpleValueType();
12020
12021 assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12022        "unsupported shuffle type");
12023
12024 if (V2.getOpcode() == ISD::UNDEF)
12025 V2 = V1;
12026
12027 // v4i32 or v4f32
12028 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12029}
12030
12031static
12032SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12033 SDValue V1 = Op.getOperand(0);
12034 SDValue V2 = Op.getOperand(1);
12035 MVT VT = Op.getSimpleValueType();
12036 unsigned NumElems = VT.getVectorNumElements();
12037
12038 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12039 // operand of these instructions is only memory, so check if there's a
12040 // potential load folding here, otherwise use SHUFPS or MOVSD to match the
12041 // same masks.
12042 bool CanFoldLoad = false;
12043
12044 // Trivial case, when V2 comes from a load.
12045 if (MayFoldVectorLoad(V2))
12046 CanFoldLoad = true;
12047
12048 // When V1 is a load, it can be folded later into a store in isel, example:
12049 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12050 // turns into:
12051 // (MOVLPSmr addr:$src1, VR128:$src2)
12052 // So, recognize this potential and also use MOVLPS or MOVLPD
12053 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12054 CanFoldLoad = true;
12055
12056 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12057 if (CanFoldLoad) {
12058 if (HasSSE2 && NumElems == 2)
12059 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12060
12061 if (NumElems == 4)
12062 // If we don't care about the second element, proceed to use movss.
12063 if (SVOp->getMaskElt(1) != -1)
12064 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12065 }
12066
12067 // movl and movlp will both match v2i64, but v2i64 is never matched by
12068 // movl earlier because we make it strict to avoid messing with the movlp load
12069 // folding logic (see the code above the getMOVLP call). So match it here;
12070 // this is horrible, but it will stay like this until we move all shuffle
12071 // matching to x86 specific nodes. Note that for the 1st condition all
12072 // types are matched with movsd.
12073 if (HasSSE2) {
12074 // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12075 // so as to remove this logic from here, as much as possible
12076 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12077 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12078 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12079 }
12080
12081 assert(VT != MVT::v4i32 && "unsupported shuffle type");
12082
12083 // Invert the operand order and use SHUFPS to match it.
12084 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12085 getShuffleSHUFImmediate(SVOp), DAG);
12086}
12087
12088static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12089 SelectionDAG &DAG) {
12090 SDLoc dl(Load);
12091 MVT VT = Load->getSimpleValueType(0);
12092 MVT EVT = VT.getVectorElementType();
12093 SDValue Addr = Load->getOperand(1);
12094 SDValue NewAddr = DAG.getNode(
12095 ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12096 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12097
12098 SDValue NewLoad =
12099 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12100 DAG.getMachineFunction().getMachineMemOperand(
12101 Load->getMemOperand(), 0, EVT.getStoreSize()));
12102 return NewLoad;
12103}
12104
12105// It is only safe to call this function if isINSERTPSMask is true for
12106// this shufflevector mask.
12107static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12108 SelectionDAG &DAG) {
12109 // Generate an insertps instruction when inserting an f32 from memory onto a
12110 // v4f32 or when copying a member from one v4f32 to another.
12111 // We also use it for transferring i32 from one register to another,
12112 // since it simply copies the same bits.
12113 // If we're transferring an i32 from memory to a specific element in a
12114 // register, we output a generic DAG that will match the PINSRD
12115 // instruction.
12116 MVT VT = SVOp->getSimpleValueType(0);
12117 MVT EVT = VT.getVectorElementType();
12118 SDValue V1 = SVOp->getOperand(0);
12119 SDValue V2 = SVOp->getOperand(1);
12120 auto Mask = SVOp->getMask();
12121 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12122        "unsupported vector type for insertps/pinsrd");
12123
12124 auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12125 auto FromV2Predicate = [](const int &i) { return i >= 4; };
12126 int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12127
12128 SDValue From;
12129 SDValue To;
12130 unsigned DestIndex;
12131 if (FromV1 == 1) {
12132 From = V1;
12133 To = V2;
12134 DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12135 Mask.begin();
12136
12137 // If we have 1 element from each vector, we have to check if we're
12138 // changing V1's element's place. If so, we're done. Otherwise, we
12139 // should assume we're changing V2's element's place and behave
12140 // accordingly.
12141 int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12142 assert(DestIndex <= INT32_MAX && "truncated destination index");
12143 if (FromV1 == FromV2 &&
12144 static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12145 From = V2;
12146 To = V1;
12147 DestIndex =
12148 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12149 }
12150 } else {
12151 assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12152        "More than one element from V1 and from V2, or no elements from one "
12153        "of the vectors. This case should not have returned true from "
12154        "isINSERTPSMask");
12155 From = V2;
12156 To = V1;
12157 DestIndex =
12158 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12159 }
12160
12161 // Get an index into the source vector in the range [0,4) (the mask is
12162 // in the range [0,8) because it can address V1 and V2)
12163 unsigned SrcIndex = Mask[DestIndex] % 4;
12164 if (MayFoldLoad(From)) {
12165 // Trivial case, when From comes from a load and is only used by the
12166 // shuffle. Make it use insertps from the vector that we need from that
12167 // load.
12168 SDValue NewLoad =
12169 NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12170 if (!NewLoad.getNode())
12171 return SDValue();
12172
12173 if (EVT == MVT::f32) {
12174 // Create this as a scalar to vector to match the instruction pattern.
12175 SDValue LoadScalarToVector =
12176 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12177 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12178 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12179 InsertpsMask);
12180 } else { // EVT == MVT::i32
12181 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12182 // instruction, to match the PINSRD instruction, which loads an i32 to a
12183 // certain vector element.
12184 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12185 DAG.getConstant(DestIndex, MVT::i32));
12186 }
12187 }
12188
12189 // Vector-element-to-vector
12190 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12191 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12192}
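// Illustrative sketch (not part of the original source). INSERTPS takes an
// 8-bit immediate: bits [7:6] select the source element (only meaningful for
// the register form), bits [5:4] select the destination element, and bits
// [3:0] are a zero mask. The two constants built above correspond to these
// helpers; the function names are hypothetical.
#include <cstdint>

static uint8_t insertpsImmFromRegister(unsigned SrcIndex, unsigned DestIndex) {
  return static_cast<uint8_t>((SrcIndex << 6) | (DestIndex << 4)); // no zeroing
}

static uint8_t insertpsImmFromMemory(unsigned DestIndex) {
  // For the memory form the loaded scalar is always "element 0", so only the
  // destination field matters.
  return static_cast<uint8_t>(DestIndex << 4);
}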
12193
12194// Reduce a vector shuffle to zext.
12195static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12196 SelectionDAG &DAG) {
12197 // PMOVZX is only available from SSE41.
12198 if (!Subtarget->hasSSE41())
12199 return SDValue();
12200
12201 MVT VT = Op.getSimpleValueType();
12202
12203 // Only AVX2 supports 256-bit vector integer extending.
12204 if (!Subtarget->hasInt256() && VT.is256BitVector())
12205 return SDValue();
12206
12207 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12208 SDLoc DL(Op);
12209 SDValue V1 = Op.getOperand(0);
12210 SDValue V2 = Op.getOperand(1);
12211 unsigned NumElems = VT.getVectorNumElements();
12212
12213 // Extending is a unary operation and the element type of the source vector
12214 // must not be equal to or larger than i64.
12215 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12216 VT.getVectorElementType() == MVT::i64)
12217 return SDValue();
12218
12219 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12220 unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12221 while ((1U << Shift) < NumElems) {
12222 if (SVOp->getMaskElt(1U << Shift) == 1)
12223 break;
12224 Shift += 1;
12225 // The maximal ratio is 8, i.e. from i8 to i64.
12226 if (Shift > 3)
12227 return SDValue();
12228 }
12229
12230 // Check the shuffle mask.
12231 unsigned Mask = (1U << Shift) - 1;
12232 for (unsigned i = 0; i != NumElems; ++i) {
12233 int EltIdx = SVOp->getMaskElt(i);
12234 if ((i & Mask) != 0 && EltIdx != -1)
12235 return SDValue();
12236 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12237 return SDValue();
12238 }
12239
12240 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12241 MVT NeVT = MVT::getIntegerVT(NBits);
12242 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12243
12244 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12245 return SDValue();
12246
12247 return DAG.getNode(ISD::BITCAST, DL, VT,
12248 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12249}
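// Illustrative sketch (not part of the original source). It mirrors the mask
// test above: a shuffle reads as a zero-extension with ratio (1 << Shift)
// when mask slot k*(1 << Shift) is k and every other slot is undef (those
// slots are filled with zeros by the extension). The helper name
// isZeroExtendMask is hypothetical.
#include <vector>

static bool isZeroExtendMask(const std::vector<int> &Mask, unsigned Shift) {
  unsigned Ratio = 1u << Shift;
  for (size_t i = 0; i < Mask.size(); ++i) {
    int EltIdx = Mask[i];
    if (i % Ratio != 0) {
      if (EltIdx != -1)                       // in-between slots must be undef
        return false;
    } else if (EltIdx != static_cast<int>(i / Ratio)) {
      return false;                           // slot k*Ratio must read element k
    }
  }
  return true;
}
// Example: the v8i16 mask <0,-1, 1,-1, 2,-1, 3,-1> passes with Shift == 1 and
// corresponds to zero-extending four i16 elements to i32.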
12250
12251static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12252 SelectionDAG &DAG) {
12253 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12254 MVT VT = Op.getSimpleValueType();
12255 SDLoc dl(Op);
12256 SDValue V1 = Op.getOperand(0);
12257 SDValue V2 = Op.getOperand(1);
12258
12259 if (isZeroShuffle(SVOp))
12260 return getZeroVector(VT, Subtarget, DAG, dl);
12261
12262 // Handle splat operations
12263 if (SVOp->isSplat()) {
12264 // Use vbroadcast whenever the splat comes from a foldable load
12265 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12266 if (Broadcast.getNode())
12267 return Broadcast;
12268 }
12269
12270 // Check integer expanding shuffles.
12271 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12272 if (NewOp.getNode())
12273 return NewOp;
12274
12275 // If the shuffle can be profitably rewritten as a narrower shuffle, then
12276 // do it!
12277 if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12278 VT == MVT::v32i8) {
12279 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12280 if (NewOp.getNode())
12281 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12282 } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12283 // FIXME: Figure out a cleaner way to do this.
12284 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12285 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12286 if (NewOp.getNode()) {
12287 MVT NewVT = NewOp.getSimpleValueType();
12288 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12289 NewVT, true, false))
12290 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12291 dl);
12292 }
12293 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12294 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12295 if (NewOp.getNode()) {
12296 MVT NewVT = NewOp.getSimpleValueType();
12297 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12298 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12299 dl);
12300 }
12301 }
12302 }
12303 return SDValue();
12304}
12305
12306SDValue
12307X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12308 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12309 SDValue V1 = Op.getOperand(0);
12310 SDValue V2 = Op.getOperand(1);
12311 MVT VT = Op.getSimpleValueType();
12312 SDLoc dl(Op);
12313 unsigned NumElems = VT.getVectorNumElements();
12314 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12315 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12316 bool V1IsSplat = false;
12317 bool V2IsSplat = false;
12318 bool HasSSE2 = Subtarget->hasSSE2();
12319 bool HasFp256 = Subtarget->hasFp256();
12320 bool HasInt256 = Subtarget->hasInt256();
12321 MachineFunction &MF = DAG.getMachineFunction();
12322 bool OptForSize = MF.getFunction()->getAttributes().
12323 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12324
12325 // Check if we should use the experimental vector shuffle lowering. If so,
12326 // delegate completely to that code path.
12327 if (ExperimentalVectorShuffleLowering)
12328 return lowerVectorShuffle(Op, Subtarget, DAG);
12329
12330 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12331
12332 if (V1IsUndef && V2IsUndef)
12333 return DAG.getUNDEF(VT);
12334
12335 // When we create a shuffle node we put the UNDEF node as the second operand,
12336 // but in some cases the first operand may be transformed to UNDEF.
12337 // In this case we should just commute the node.
12338 if (V1IsUndef)
12339 return DAG.getCommutedVectorShuffle(*SVOp);
12340
12341 // Vector shuffle lowering takes 3 steps:
12342 //
12343 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12344 // narrowing and commutation of operands should be handled.
12345 // 2) Matching of shuffles with known shuffle masks to x86 target specific
12346 // shuffle nodes.
12347 // 3) Rewriting of unmatched masks into new generic shuffle operations,
12348 // so the shuffle can be broken into other shuffles and the legalizer can
12349 // try the lowering again.
12350 //
12351 // The general idea is that no vector_shuffle operation should be left to
12352 // be matched during isel, all of them must be converted to a target specific
12353 // node here.
12354
12355 // Normalize the input vectors. Here splats, zeroed vectors, profitable
12356 // narrowing and commutation of operands should be handled. The actual code
12357 // doesn't include all of those, work in progress...
12358 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12359 if (NewOp.getNode())
12360 return NewOp;
12361
12362 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12363
12364 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12365 // unpckh_undef). Only use pshufd if speed is more important than size.
12366 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12367 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12368 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12369 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12370
12371 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12372 V2IsUndef && MayFoldVectorLoad(V1))
12373 return getMOVDDup(Op, dl, V1, DAG);
12374
12375 if (isMOVHLPS_v_undef_Mask(M, VT))
12376 return getMOVHighToLow(Op, dl, DAG);
12377
12378 // Used to match splats
12379 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12380 (VT == MVT::v2f64 || VT == MVT::v2i64))
12381 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12382
12383 if (isPSHUFDMask(M, VT)) {
12384 // The actual implementation will match the mask in the if above and then
12385 // during isel it can match several different instructions, not only pshufd
12386 // as its name says, sad but true, emulate the behavior for now...
12387 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12388 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12389
12390 unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12391
12392 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12393 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12394
12395 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12396 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12397 DAG);
12398
12399 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12400 TargetMask, DAG);
12401 }
12402
12403 if (isPALIGNRMask(M, VT, Subtarget))
12404 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12405 getShufflePALIGNRImmediate(SVOp),
12406 DAG);
12407
12408 if (isVALIGNMask(M, VT, Subtarget))
12409 return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12410 getShuffleVALIGNImmediate(SVOp),
12411 DAG);
12412
12413 // Check if this can be converted into a logical shift.
12414 bool isLeft = false;
12415 unsigned ShAmt = 0;
12416 SDValue ShVal;
12417 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12418 if (isShift && ShVal.hasOneUse()) {
12419 // If the shifted value has multiple uses, it may be cheaper to use
12420 // v_set0 + movlhps or movhlps, etc.
12421 MVT EltVT = VT.getVectorElementType();
12422 ShAmt *= EltVT.getSizeInBits();
12423 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12424 }
12425
12426 if (isMOVLMask(M, VT)) {
12427 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12428 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12429 if (!isMOVLPMask(M, VT)) {
12430 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12431 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12432
12433 if (VT == MVT::v4i32 || VT == MVT::v4f32)
12434 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12435 }
12436 }
12437
12438 // FIXME: fold these into legal mask.
12439 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12440 return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12441
12442 if (isMOVHLPSMask(M, VT))
12443 return getMOVHighToLow(Op, dl, DAG);
12444
12445 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12446 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12447
12448 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12449 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12450
12451 if (isMOVLPMask(M, VT))
12452 return getMOVLP(Op, dl, DAG, HasSSE2);
12453
12454 if (ShouldXformToMOVHLPS(M, VT) ||
12455 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12456 return DAG.getCommutedVectorShuffle(*SVOp);
12457
12458 if (isShift) {
12459 // No better options. Use a vshldq / vsrldq.
12460 MVT EltVT = VT.getVectorElementType();
12461 ShAmt *= EltVT.getSizeInBits();
12462 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12463 }
12464
12465 bool Commuted = false;
12466 // FIXME: This should also accept a bitcast of a splat? Be careful, not
12467 // 1,1,1,1 -> v8i16 though.
12468 BitVector UndefElements;
12469 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12470 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12471 V1IsSplat = true;
12472 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12473 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12474 V2IsSplat = true;
12475
12476 // Canonicalize the splat or undef, if present, to be on the RHS.
12477 if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12478 CommuteVectorShuffleMask(M, NumElems);
12479 std::swap(V1, V2);
12480 std::swap(V1IsSplat, V2IsSplat);
12481 Commuted = true;
12482 }
12483
12484 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12485 // Shuffling low element of v1 into undef, just return v1.
12486 if (V2IsUndef)
12487 return V1;
12488 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12489 // the instruction selector will not match, so get a canonical MOVL with
12490 // swapped operands to undo the commute.
12491 return getMOVL(DAG, dl, VT, V2, V1);
12492 }
12493
12494 if (isUNPCKLMask(M, VT, HasInt256))
12495 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12496
12497 if (isUNPCKHMask(M, VT, HasInt256))
12498 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12499
12500 if (V2IsSplat) {
12501 // Normalize the mask so all entries that point to V2 point to its first
12502 // element, then try to match unpck{h|l} again. If it matches, return a
12503 // new vector_shuffle with the corrected mask.
12504 SmallVector<int, 8> NewMask(M.begin(), M.end());
12505 NormalizeMask(NewMask, NumElems);
12506 if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12507 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12508 if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12509 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12510 }
12511
12512 if (Commuted) {
12513 // Commute it back and try unpck* again.
12514 // FIXME: this seems wrong.
12515 CommuteVectorShuffleMask(M, NumElems);
12516 std::swap(V1, V2);
12517 std::swap(V1IsSplat, V2IsSplat);
12518
12519 if (isUNPCKLMask(M, VT, HasInt256))
12520 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12521
12522 if (isUNPCKHMask(M, VT, HasInt256))
12523 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12524 }
12525
12526 // Normalize the node to match x86 shuffle ops if needed
12527 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12528 return DAG.getCommutedVectorShuffle(*SVOp);
12529
12530 // The checks below are all present in isShuffleMaskLegal, but they are
12531 // inlined here right now to enable us to directly emit target specific
12532 // nodes, and remove one by one until they don't return Op anymore.
12533
12534 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12535 SVOp->getSplatIndex() == 0 && V2IsUndef) {
12536 if (VT == MVT::v2f64 || VT == MVT::v2i64)
12537 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12538 }
12539
12540 if (isPSHUFHWMask(M, VT, HasInt256))
12541 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12542 getShufflePSHUFHWImmediate(SVOp),
12543 DAG);
12544
12545 if (isPSHUFLWMask(M, VT, HasInt256))
12546 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12547 getShufflePSHUFLWImmediate(SVOp),
12548 DAG);
12549
12550 unsigned MaskValue;
12551 if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12552 &MaskValue))
12553 return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12554
12555 if (isSHUFPMask(M, VT))
12556 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12557 getShuffleSHUFImmediate(SVOp), DAG);
12558
12559 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12560 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12561 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12562 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12563
12564 //===--------------------------------------------------------------------===//
12565 // Generate target specific nodes for 128 or 256-bit shuffles only
12566 // supported in the AVX instruction set.
12567 //
12568
12569 // Handle VMOVDDUPY permutations
12570 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12571 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12572
12573 // Handle VPERMILPS/D* permutations
12574 if (isVPERMILPMask(M, VT)) {
12575 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12576 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12577 getShuffleSHUFImmediate(SVOp), DAG);
12578 return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12579 getShuffleSHUFImmediate(SVOp), DAG);
12580 }
12581
12582 unsigned Idx;
12583 if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12584 return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12585 Idx*(NumElems/2), DAG, dl);
12586
12587 // Handle VPERM2F128/VPERM2I128 permutations
12588 if (isVPERM2X128Mask(M, VT, HasFp256))
12589 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12590 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12591
12592 if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12593 return getINSERTPS(SVOp, dl, DAG);
12594
12595 unsigned Imm8;
12596 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12597 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12598
12599 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12600 VT.is512BitVector()) {
12601 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12602 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12603 SmallVector<SDValue, 16> permclMask;
12604 for (unsigned i = 0; i != NumElems; ++i) {
12605 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12606 }
12607
12608 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12609 if (V2IsUndef)
12610 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12611 return DAG.getNode(X86ISD::VPERMV, dl, VT,
12612 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12613 return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12614 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12615 }
12616
12617 //===--------------------------------------------------------------------===//
12618 // Since no target specific shuffle was selected for this generic one,
12619 // lower it into other known shuffles. FIXME: this isn't true yet, but
12620 // this is the plan.
12621 //
12622
12623 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12624 if (VT == MVT::v8i16) {
12625 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12626 if (NewOp.getNode())
12627 return NewOp;
12628 }
12629
12630 if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12631 SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12632 if (NewOp.getNode())
12633 return NewOp;
12634 }
12635
12636 if (VT == MVT::v16i8) {
12637 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12638 if (NewOp.getNode())
12639 return NewOp;
12640 }
12641
12642 if (VT == MVT::v32i8) {
12643 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12644 if (NewOp.getNode())
12645 return NewOp;
12646 }
12647
12648 // Handle all 128-bit wide vectors with 4 elements, and match them with
12649 // several different shuffle types.
12650 if (NumElems == 4 && VT.is128BitVector())
12651 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12652
12653 // Handle general 256-bit shuffles
12654 if (VT.is256BitVector())
12655 return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12656
12657 return SDValue();
12658}
12659
12660// This function assumes its argument is a BUILD_VECTOR of constants or
12661 // undef SDNodes, i.e., ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12662// true.
12663static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12664 unsigned &MaskValue) {
12665 MaskValue = 0;
12666 unsigned NumElems = BuildVector->getNumOperands();
12667 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12668 unsigned NumLanes = (NumElems - 1) / 8 + 1;
12669 unsigned NumElemsInLane = NumElems / NumLanes;
12670
12671 // Blend for v16i16 should be symmetric for both lanes.
12672 for (unsigned i = 0; i < NumElemsInLane; ++i) {
12673 SDValue EltCond = BuildVector->getOperand(i);
12674 SDValue SndLaneEltCond =
12675 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12676
12677 int Lane1Cond = -1, Lane2Cond = -1;
12678 if (isa<ConstantSDNode>(EltCond))
12679 Lane1Cond = !isZero(EltCond);
12680 if (isa<ConstantSDNode>(SndLaneEltCond))
12681 Lane2Cond = !isZero(SndLaneEltCond);
12682
12683 if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12684 // Lane1Cond != 0, means we want the first argument.
12685 // Lane1Cond == 0, means we want the second argument.
12686 // The encoding of this argument is 0 for the first argument, 1
12687 // for the second. Therefore, invert the condition.
12688 MaskValue |= !Lane1Cond << i;
12689 else if (Lane1Cond < 0)
12690 MaskValue |= !Lane2Cond << i;
12691 else
12692 return false;
12693 }
12694 return true;
12695}
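// Illustrative sketch (not part of the original source). It reproduces the
// immediate construction above for a plain scalar condition vector where
// every element is a known constant: a nonzero condition selects the first
// blend operand, which is encoded as a 0 bit, so the stored bit is the
// inverted condition. Two-lane types (e.g. v16i16) reuse one 8-bit immediate
// for both 128-bit lanes, hence the symmetry requirement. The helper name
// blendMaskFromConditions is hypothetical.
#include <vector>

static bool blendMaskFromConditions(const std::vector<int> &Cond,
                                    unsigned &MaskValue) {
  MaskValue = 0;
  unsigned NumElems = static_cast<unsigned>(Cond.size());
  unsigned NumLanes = (NumElems - 1) / 8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    int Lane1Cond = Cond[i] != 0;
    int Lane2Cond =
        (NumLanes == 2) ? (Cond[i + NumElemsInLane] != 0) : Lane1Cond;
    if (Lane1Cond != Lane2Cond)
      return false;                     // lanes disagree: no single immediate
    MaskValue |= static_cast<unsigned>(!Lane1Cond) << i;
  }
  return true;
}
// Example: a v4i32 condition <~0, 0, ~0, 0> produces MaskValue 0xA (binary
// 1010), i.e. elements 1 and 3 come from the second operand.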
12696
12697/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12698/// instruction.
12699static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12700 SelectionDAG &DAG) {
12701 SDValue Cond = Op.getOperand(0);
12702 SDValue LHS = Op.getOperand(1);
12703 SDValue RHS = Op.getOperand(2);
12704 SDLoc dl(Op);
12705 MVT VT = Op.getSimpleValueType();
12706 MVT EltVT = VT.getVectorElementType();
12707 unsigned NumElems = VT.getVectorNumElements();
12708
12709 // There is no blend with immediate in AVX-512.
12710 if (VT.is512BitVector())
12711 return SDValue();
12712
12713 if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12714 return SDValue();
12715 if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12716 return SDValue();
12717
12718 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12719 return SDValue();
12720
12721 // Check the mask for BLEND and build the value.
12722 unsigned MaskValue = 0;
12723 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12724 return SDValue();
12725
12726 // Convert i32 vectors to floating point if it is not AVX2.
12727 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12728 MVT BlendVT = VT;
12729 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12730 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12731 NumElems);
12732 LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12733 RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12734 }
12735
12736 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12737 DAG.getConstant(MaskValue, MVT::i32));
12738 return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12739}
12740
12741SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12742 // A vselect where all conditions and data are constants can be optimized into
12743 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12744 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12745 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12746 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12747 return SDValue();
12748
12749 SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12750 if (BlendOp.getNode())
12751 return BlendOp;
12752
12753 // Some types for vselect were previously set to Expand, not Legal or
12754 // Custom. Return an empty SDValue so we fall through to Expand, after
12755 // the Custom lowering phase.
12756 MVT VT = Op.getSimpleValueType();
12757 switch (VT.SimpleTy) {
12758 default:
12759 break;
12760 case MVT::v8i16:
12761 case MVT::v16i16:
12762 if (Subtarget->hasBWI() && Subtarget->hasVLX())
12763 break;
12764 return SDValue();
12765 }
12766
12767 // We couldn't create a "Blend with immediate" node.
12768 // This node should still be legal, but we'll have to emit a blendv*
12769 // instruction.
12770 return Op;
12771}
12772
12773static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12774 MVT VT = Op.getSimpleValueType();
12775 SDLoc dl(Op);
12776
12777 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12778 return SDValue();
12779
12780 if (VT.getSizeInBits() == 8) {
12781 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12782 Op.getOperand(0), Op.getOperand(1));
12783 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12784 DAG.getValueType(VT));
12785 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12786 }
12787
12788 if (VT.getSizeInBits() == 16) {
12789 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12790 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12791 if (Idx == 0)
12792 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12794 DAG.getNode(ISD::BITCAST, dl,
12795 MVT::v4i32,
12796 Op.getOperand(0)),
12797 Op.getOperand(1)));
12798 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12799 Op.getOperand(0), Op.getOperand(1));
12800 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12801 DAG.getValueType(VT));
12802 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12803 }
12804
12805 if (VT == MVT::f32) {
12806 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12807 // the result back to FR32 register. It's only worth matching if the
12808 // result has a single use which is a store or a bitcast to i32. And in
12809 // the case of a store, it's not worth it if the index is a constant 0,
12810 // because a MOVSSmr can be used instead, which is smaller and faster.
12811 if (!Op.hasOneUse())
12812 return SDValue();
12813 SDNode *User = *Op.getNode()->use_begin();
12814 if ((User->getOpcode() != ISD::STORE ||
12815 (isa<ConstantSDNode>(Op.getOperand(1)) &&
12816 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12817 (User->getOpcode() != ISD::BITCAST ||
12818 User->getValueType(0) != MVT::i32))
12819 return SDValue();
12820 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12821 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12822 Op.getOperand(0)),
12823 Op.getOperand(1));
12824 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12825 }
12826
12827 if (VT == MVT::i32 || VT == MVT::i64) {
12828 // EXTRACTPS/pextrq work with a constant index.
12829 if (isa<ConstantSDNode>(Op.getOperand(1)))
12830 return Op;
12831 }
12832 return SDValue();
12833}
12834
12835/// Extract one bit from mask vector, like v16i1 or v8i1.
12836/// AVX-512 feature.
12837SDValue
12838X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12839 SDValue Vec = Op.getOperand(0);
12840 SDLoc dl(Vec);
12841 MVT VecVT = Vec.getSimpleValueType();
12842 SDValue Idx = Op.getOperand(1);
12843 MVT EltVT = Op.getSimpleValueType();
12844
12845 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12846 assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
12847        "Unexpected vector type in ExtractBitFromMaskVector");
12848
12849 // A variable index can't be handled in mask registers,
12850 // so extend the vector to VR512.
12851 if (!isa<ConstantSDNode>(Idx)) {
12852 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12853 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12854 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12855 ExtVT.getVectorElementType(), Ext, Idx);
12856 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12857 }
12858
12859 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12860 const TargetRegisterClass* rc = getRegClassFor(VecVT);
12861 if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
12862 rc = getRegClassFor(MVT::v16i1);
12863 unsigned MaxSift = rc->getSize()*8 - 1;
12864 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12865 DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12866 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12867 DAG.getConstant(MaxSift, MVT::i8));
12868 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12869 DAG.getIntPtrConstant(0));
12870}
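// Illustrative sketch (not part of the original source). The constant-index
// path above isolates bit IdxVal of a mask register by shifting it up to the
// most significant bit position and then back down to bit 0; the scalar
// analogue below shows the same trick on a plain integer mask, with MaxBit
// playing the role of MaxSift. The helper name extractMaskBit is hypothetical.
#include <cstdint>

static uint64_t extractMaskBit(uint64_t Mask, unsigned IdxVal, unsigned Bits) {
  unsigned MaxBit = Bits - 1;                            // e.g. 15 for a 16-bit mask
  Mask <<= (MaxBit - IdxVal);                            // move bit IdxVal to the top
  Mask &= (Bits == 64) ? ~0ULL : ((1ULL << Bits) - 1);   // stay within the width
  return Mask >> MaxBit;                                 // move it down to bit 0
}
// Example: extractMaskBit(0x4 /* binary 0100 */, 2, 16) == 1.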
12871
12872SDValue
12873X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12874 SelectionDAG &DAG) const {
12875 SDLoc dl(Op);
12876 SDValue Vec = Op.getOperand(0);
12877 MVT VecVT = Vec.getSimpleValueType();
12878 SDValue Idx = Op.getOperand(1);
12879
12880 if (Op.getSimpleValueType() == MVT::i1)
12881 return ExtractBitFromMaskVector(Op, DAG);
12882
12883 if (!isa<ConstantSDNode>(Idx)) {
12884 if (VecVT.is512BitVector() ||
12885 (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12886 VecVT.getVectorElementType().getSizeInBits() == 32)) {
12887
12888 MVT MaskEltVT =
12889 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12890 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12891 MaskEltVT.getSizeInBits());
12892
12893 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12894 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12895 getZeroVector(MaskVT, Subtarget, DAG, dl),
12896 Idx, DAG.getConstant(0, getPointerTy()));
12897 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12899 Perm, DAG.getConstant(0, getPointerTy()));
12900 }
12901 return SDValue();
12902 }
12903
12904 // If this is a 256-bit vector result, first extract the 128-bit vector and
12905 // then extract the element from the 128-bit vector.
12906 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12907
12908 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12909 // Get the 128-bit vector.
12910 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12911 MVT EltVT = VecVT.getVectorElementType();
12912
12913 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12914
12915 //if (IdxVal >= NumElems/2)
12916 // IdxVal -= NumElems/2;
12917 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12918 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12919 DAG.getConstant(IdxVal, MVT::i32));
12920 }
12921
12922 assert(VecVT.is128BitVector() && "Unexpected vector length");
12923
12924 if (Subtarget->hasSSE41()) {
12925 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12926 if (Res.getNode())
12927 return Res;
12928 }
12929
12930 MVT VT = Op.getSimpleValueType();
12931 // TODO: handle v16i8.
12932 if (VT.getSizeInBits() == 16) {
12933 SDValue Vec = Op.getOperand(0);
12934 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12935 if (Idx == 0)
12936 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12937 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12938 DAG.getNode(ISD::BITCAST, dl,
12939 MVT::v4i32, Vec),
12940 Op.getOperand(1)));
12941    // Transform it so it matches pextrw, which produces a 32-bit result.
12942 MVT EltVT = MVT::i32;
12943 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12944 Op.getOperand(0), Op.getOperand(1));
12945 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12946 DAG.getValueType(VT));
12947 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12948 }
12949
12950 if (VT.getSizeInBits() == 32) {
12951 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12952 if (Idx == 0)
12953 return Op;
12954
12955 // SHUFPS the element to the lowest double word, then movss.
12956 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12957 MVT VVT = Op.getOperand(0).getSimpleValueType();
12958 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12959 DAG.getUNDEF(VVT), Mask);
12960 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12961 DAG.getIntPtrConstant(0));
12962 }
12963
12964 if (VT.getSizeInBits() == 64) {
12965 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12966 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12967 // to match extract_elt for f64.
12968 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12969 if (Idx == 0)
12970 return Op;
12971
12972 // UNPCKHPD the element to the lowest double word, then movsd.
12973 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12974 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12975 int Mask[2] = { 1, -1 };
12976 MVT VVT = Op.getOperand(0).getSimpleValueType();
12977 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12978 DAG.getUNDEF(VVT), Mask);
12979 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12980 DAG.getIntPtrConstant(0));
12981 }
12982
12983 return SDValue();
12984}
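
// An illustrative sketch (not part of this file) of the index arithmetic used
// above when extracting from a 256-bit or 512-bit vector: the element index is
// split into a 128-bit chunk number and an index within that chunk, so the
// subtraction in the code is simply IdxVal % ElemsPerChunk.
struct WideIndexSketch { unsigned Chunk, IdxInChunk; };
static WideIndexSketch splitWideIndexSketch(unsigned IdxVal, unsigned EltSizeInBits) {
  unsigned ElemsPerChunk = 128 / EltSizeInBits;
  return { IdxVal / ElemsPerChunk,   // which 128-bit chunk to extract
           IdxVal % ElemsPerChunk }; // element index within that chunk
}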
12985
12986/// Insert one bit to mask vector, like v16i1 or v8i1.
12987/// AVX-512 feature.
12988SDValue
12989X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12990 SDLoc dl(Op);
12991 SDValue Vec = Op.getOperand(0);
12992 SDValue Elt = Op.getOperand(1);
12993 SDValue Idx = Op.getOperand(2);
12994 MVT VecVT = Vec.getSimpleValueType();
12995
12996 if (!isa<ConstantSDNode>(Idx)) {
12997    // Non-constant index. Extend the source and destination,
12998    // insert the element, and then truncate the result.
12999 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13000 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13001 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13002 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13003 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13004 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13005 }
13006
13007 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13008 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13009 if (Vec.getOpcode() == ISD::UNDEF)
13010 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13011 DAG.getConstant(IdxVal, MVT::i8));
13012 const TargetRegisterClass* rc = getRegClassFor(VecVT);
13013 unsigned MaxSift = rc->getSize()*8 - 1;
13014 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13015 DAG.getConstant(MaxSift, MVT::i8));
13016 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13017 DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13018 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13019}
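
// A standalone scalar sketch (illustrative only, not part of this file) of the
// insert path above. Shifting the element left by Width-1 and back right by
// Width-1 - Idx keeps only bit 0 of Elt, repositioned at bit Idx; the result is
// then ORed into the existing mask, mirroring the VSHLI/VSRLI/OR sequence. The
// WidthMask emulates the fixed width of the mask register.
static uint64_t insertMaskBitSketch(uint64_t Mask, uint64_t Elt, unsigned Idx,
                                    unsigned Width) {
  unsigned MaxShift = Width - 1;
  uint64_t WidthMask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  uint64_t Bit = ((Elt << MaxShift) & WidthMask) >> (MaxShift - Idx);
  return (Mask | Bit) & WidthMask;
}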
13020
13021SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13022 SelectionDAG &DAG) const {
13023 MVT VT = Op.getSimpleValueType();
13024 MVT EltVT = VT.getVectorElementType();
13025
13026 if (EltVT == MVT::i1)
13027 return InsertBitToMaskVector(Op, DAG);
13028
13029 SDLoc dl(Op);
13030 SDValue N0 = Op.getOperand(0);
13031 SDValue N1 = Op.getOperand(1);
13032 SDValue N2 = Op.getOperand(2);
13033 if (!isa<ConstantSDNode>(N2))
13034 return SDValue();
13035 auto *N2C = cast<ConstantSDNode>(N2);
13036 unsigned IdxVal = N2C->getZExtValue();
13037
13038 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13039 // into that, and then insert the subvector back into the result.
13040 if (VT.is256BitVector() || VT.is512BitVector()) {
13041 // Get the desired 128-bit vector half.
13042 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13043
13044 // Insert the element into the desired half.
13045 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13046 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13047
13048 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13049 DAG.getConstant(IdxIn128, MVT::i32));
13050
13051    // Insert the changed part back into the wider vector.
13052 return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13053 }
13054  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13055
13056 if (Subtarget->hasSSE41()) {
13057 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13058 unsigned Opc;
13059 if (VT == MVT::v8i16) {
13060 Opc = X86ISD::PINSRW;
13061 } else {
13062        assert(VT == MVT::v16i8);
13063 Opc = X86ISD::PINSRB;
13064 }
13065
13066      // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
13067 // argument.
13068 if (N1.getValueType() != MVT::i32)
13069 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13070 if (N2.getValueType() != MVT::i32)
13071 N2 = DAG.getIntPtrConstant(IdxVal);
13072 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13073 }
13074
13075 if (EltVT == MVT::f32) {
13076 // Bits [7:6] of the constant are the source select. This will always be
13077 // zero here. The DAG Combiner may combine an extract_elt index into
13078 // these
13079 // bits. For example (insert (extract, 3), 2) could be matched by
13080 // putting
13081 // the '3' into bits [7:6] of X86ISD::INSERTPS.
13082 // Bits [5:4] of the constant are the destination select. This is the
13083 // value of the incoming immediate.
13084 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13085 // combine either bitwise AND or insert of float 0.0 to set these bits.
13086 N2 = DAG.getIntPtrConstant(IdxVal << 4);
13087      // Create this as a scalar-to-vector.
13088 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13089 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13090 }
13091
13092 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13093 // PINSR* works with constant index.
13094 return Op;
13095 }
13096 }
13097
13098 if (EltVT == MVT::i8)
13099 return SDValue();
13100
13101 if (EltVT.getSizeInBits() == 16) {
13102    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13103 // as its second argument.
13104 if (N1.getValueType() != MVT::i32)
13105 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13106 if (N2.getValueType() != MVT::i32)
13107 N2 = DAG.getIntPtrConstant(IdxVal);
13108 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13109 }
13110 return SDValue();
13111}
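
// A hypothetical helper (illustrative only, not an LLVM API) showing how the
// insertps immediate described in the comments above is laid out. The lowering
// only sets the destination field, which is why it emits IdxVal << 4.
static unsigned insertpsImmediateSketch(unsigned SrcSelect, unsigned DstSelect,
                                        unsigned ZeroMask) {
  return ((SrcSelect & 0x3) << 6) |  // bits [7:6]: source element select
         ((DstSelect & 0x3) << 4) |  // bits [5:4]: destination element select
         (ZeroMask & 0xf);           // bits [3:0]: zero mask
}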
13112
13113static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13114 SDLoc dl(Op);
13115 MVT OpVT = Op.getSimpleValueType();
13116
13117  // If this is a 256-bit or wider vector result, first insert into a 128-bit
13118  // vector and then insert into the wider vector.
13119 if (!OpVT.is128BitVector()) {
13120 // Insert into a 128-bit vector.
13121 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13122 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13123 OpVT.getVectorNumElements() / SizeFactor);
13124
13125 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13126
13127 // Insert the 128-bit vector.
13128 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13129 }
13130
13131 if (OpVT == MVT::v1i64 &&
13132 Op.getOperand(0).getValueType() == MVT::i64)
13133 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13134
13135 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13136  assert(OpVT.is128BitVector() && "Expected an SSE type!");
13137 return DAG.getNode(ISD::BITCAST, dl, OpVT,
13138 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13139}
13140
13141// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13142// a simple subregister reference or explicit instructions to grab
13143// upper bits of a vector.
13144static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13145 SelectionDAG &DAG) {
13146 SDLoc dl(Op);
13147 SDValue In = Op.getOperand(0);
13148 SDValue Idx = Op.getOperand(1);
13149 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13150 MVT ResVT = Op.getSimpleValueType();
13151 MVT InVT = In.getSimpleValueType();
13152
13153 if (Subtarget->hasFp256()) {
13154 if (ResVT.is128BitVector() &&
13155 (InVT.is256BitVector() || InVT.is512BitVector()) &&
13156 isa<ConstantSDNode>(Idx)) {
13157 return Extract128BitVector(In, IdxVal, DAG, dl);
13158 }
13159 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13160 isa<ConstantSDNode>(Idx)) {
13161 return Extract256BitVector(In, IdxVal, DAG, dl);
13162 }
13163 }
13164 return SDValue();
13165}
13166
13167// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13168// simple superregister reference or explicit instructions to insert
13169// the upper bits of a vector.
13170static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13171 SelectionDAG &DAG) {
13172 if (!Subtarget->hasAVX())
13173 return SDValue();
13174
13175 SDLoc dl(Op);
13176 SDValue Vec = Op.getOperand(0);
13177 SDValue SubVec = Op.getOperand(1);
13178 SDValue Idx = Op.getOperand(2);
13179 MVT OpVT = Op.getSimpleValueType();
13180 MVT SubVecVT = SubVec.getSimpleValueType();
13181
13182 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13183 SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
13184 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13185 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13186 }
13187
13188 if (OpVT.is512BitVector() &&
13189 SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
13190 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13191 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13192 }
13193
13194 return SDValue();
13195}
13196
13197// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13198// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13199// one of the above mentioned nodes. It has to be wrapped because otherwise
13200// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13201// be used to form addressing mode. These wrapped nodes will be selected
13202// into MOV32ri.
13203SDValue
13204X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13205 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13206
13207 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13208 // global base reg.
13209 unsigned char OpFlag = 0;
13210 unsigned WrapperKind = X86ISD::Wrapper;
13211 CodeModel::Model M = DAG.getTarget().getCodeModel();
13212
13213 if (Subtarget->isPICStyleRIPRel() &&
13214 (M == CodeModel::Small || M == CodeModel::Kernel))
13215 WrapperKind = X86ISD::WrapperRIP;
13216 else if (Subtarget->isPICStyleGOT())
13217 OpFlag = X86II::MO_GOTOFF;
13218 else if (Subtarget->isPICStyleStubPIC())
13219 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13220
13221 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13222 CP->getAlignment(),
13223 CP->getOffset(), OpFlag);
13224 SDLoc DL(CP);
13225 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13226 // With PIC, the address is actually $g + Offset.
13227 if (OpFlag) {
13228 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13229 DAG.getNode(X86ISD::GlobalBaseReg,
13230 SDLoc(), getPointerTy()),
13231 Result);
13232 }
13233
13234 return Result;
13235}
13236
13237SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13238 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13239
13240 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13241 // global base reg.
13242 unsigned char OpFlag = 0;
13243 unsigned WrapperKind = X86ISD::Wrapper;
13244 CodeModel::Model M = DAG.getTarget().getCodeModel();
13245
13246 if (Subtarget->isPICStyleRIPRel() &&
13247 (M == CodeModel::Small || M == CodeModel::Kernel))
13248 WrapperKind = X86ISD::WrapperRIP;
13249 else if (Subtarget->isPICStyleGOT())
13250 OpFlag = X86II::MO_GOTOFF;
13251 else if (Subtarget->isPICStyleStubPIC())
13252 OpFlag = X86II::MO_PIC_BASE_OFFSET;
13253
13254 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13255 OpFlag);
13256 SDLoc DL(JT);
13257 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13258
13259 // With PIC, the address is actually $g + Offset.
13260 if (OpFlag)
13261 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13262 DAG.getNode(X86ISD::GlobalBaseReg,
13263 SDLoc(), getPointerTy()),
13264 Result);
13265
13266 return Result;
13267}
13268
13269SDValue
13270X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13271 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13272
13273 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13274 // global base reg.
13275 unsigned char OpFlag = 0;
13276 unsigned WrapperKind = X86ISD::Wrapper;
13277 CodeModel::Model M = DAG.getTarget().getCodeModel();
13278
13279 if (Subtarget->isPICStyleRIPRel() &&
13280 (M == CodeModel::Small || M == CodeModel::Kernel)) {
13281 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13282 OpFlag = X86II::MO_GOTPCREL;
13283 WrapperKind = X86ISD::WrapperRIP;
13284 } else if (Subtarget->isPICStyleGOT()) {
13285 OpFlag = X86II::MO_GOT;
13286 } else if (Subtarget->isPICStyleStubPIC()) {
13287 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13288 } else if (Subtarget->isPICStyleStubNoDynamic()) {
13289 OpFlag = X86II::MO_DARWIN_NONLAZY;
13290 }
13291
13292 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13293
13294 SDLoc DL(Op);
13295 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13296
13297 // With PIC, the address is actually $g + Offset.
13298 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13299 !Subtarget->is64Bit()) {
13300 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13301 DAG.getNode(X86ISD::GlobalBaseReg,
13302 SDLoc(), getPointerTy()),
13303 Result);
13304 }
13305
13306 // For symbols that require a load from a stub to get the address, emit the
13307 // load.
13308 if (isGlobalStubReference(OpFlag))
13309 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13310 MachinePointerInfo::getGOT(), false, false, false, 0);
13311
13312 return Result;
13313}
13314
13315SDValue
13316X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13317 // Create the TargetBlockAddressAddress node.
13318 unsigned char OpFlags =
13319 Subtarget->ClassifyBlockAddressReference();
13320 CodeModel::Model M = DAG.getTarget().getCodeModel();
13321 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13322 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13323 SDLoc dl(Op);
13324 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13325 OpFlags);
13326
13327 if (Subtarget->isPICStyleRIPRel() &&
13328 (M == CodeModel::Small || M == CodeModel::Kernel))
13329 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13330 else
13331 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13332
13333 // With PIC, the address is actually $g + Offset.
13334 if (isGlobalRelativeToPICBase(OpFlags)) {
13335 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13336 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13337 Result);
13338 }
13339
13340 return Result;
13341}
13342
13343SDValue
13344X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13345 int64_t Offset, SelectionDAG &DAG) const {
13346 // Create the TargetGlobalAddress node, folding in the constant
13347 // offset if it is legal.
13348 unsigned char OpFlags =
13349 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13350 CodeModel::Model M = DAG.getTarget().getCodeModel();
13351 SDValue Result;
13352 if (OpFlags == X86II::MO_NO_FLAG &&
13353 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13354 // A direct static reference to a global.
13355 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13356 Offset = 0;
13357 } else {
13358 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13359 }
13360
13361 if (Subtarget->isPICStyleRIPRel() &&
13362 (M == CodeModel::Small || M == CodeModel::Kernel))
13363 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13364 else
13365 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13366
13367 // With PIC, the address is actually $g + Offset.
13368 if (isGlobalRelativeToPICBase(OpFlags)) {
13369 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13370 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13371 Result);
13372 }
13373
13374 // For globals that require a load from a stub to get the address, emit the
13375 // load.
13376 if (isGlobalStubReference(OpFlags))
13377 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13378 MachinePointerInfo::getGOT(), false, false, false, 0);
13379
13380 // If there was a non-zero offset that we didn't fold, create an explicit
13381 // addition for it.
13382 if (Offset != 0)
13383 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13384 DAG.getConstant(Offset, getPointerTy()));
13385
13386 return Result;
13387}
13388
13389SDValue
13390X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13391 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13392 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13393 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13394}
13395
13396static SDValue
13397GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13398 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13399 unsigned char OperandFlags, bool LocalDynamic = false) {
13400 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13401 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13402 SDLoc dl(GA);
13403 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13404 GA->getValueType(0),
13405 GA->getOffset(),
13406 OperandFlags);
13407
13408 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13409 : X86ISD::TLSADDR;
13410
13411 if (InFlag) {
13412 SDValue Ops[] = { Chain, TGA, *InFlag };
13413 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13414 } else {
13415 SDValue Ops[] = { Chain, TGA };
13416 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13417 }
13418
13419 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13420 MFI->setAdjustsStack(true);
13421 MFI->setHasCalls(true);
13422
13423 SDValue Flag = Chain.getValue(1);
13424 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13425}
13426
13427// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13428static SDValue
13429LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13430 const EVT PtrVT) {
13431 SDValue InFlag;
13432 SDLoc dl(GA); // ? function entry point might be better
13433 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13434 DAG.getNode(X86ISD::GlobalBaseReg,
13435 SDLoc(), PtrVT), InFlag);
13436 InFlag = Chain.getValue(1);
13437
13438 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13439}
13440
13441// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13442static SDValue
13443LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13444 const EVT PtrVT) {
13445 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13446 X86::RAX, X86II::MO_TLSGD);
13447}
13448
13449static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13450 SelectionDAG &DAG,
13451 const EVT PtrVT,
13452 bool is64Bit) {
13453 SDLoc dl(GA);
13454
13455 // Get the start address of the TLS block for this module.
13456 X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13457 .getInfo<X86MachineFunctionInfo>();
13458 MFI->incNumLocalDynamicTLSAccesses();
13459
13460 SDValue Base;
13461 if (is64Bit) {
13462 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13463 X86II::MO_TLSLD, /*LocalDynamic=*/true);
13464 } else {
13465 SDValue InFlag;
13466 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13467 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13468 InFlag = Chain.getValue(1);
13469 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13470 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13471 }
13472
13473 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13474 // of Base.
13475
13476 // Build x@dtpoff.
13477 unsigned char OperandFlags = X86II::MO_DTPOFF;
13478 unsigned WrapperKind = X86ISD::Wrapper;
13479 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13480 GA->getValueType(0),
13481 GA->getOffset(), OperandFlags);
13482 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13483
13484 // Add x@dtpoff with the base.
13485 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13486}
13487
13488// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13489static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13490 const EVT PtrVT, TLSModel::Model model,
13491 bool is64Bit, bool isPIC) {
13492 SDLoc dl(GA);
13493
13494 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13495 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13496 is64Bit ? 257 : 256));
13497
13498 SDValue ThreadPointer =
13499 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13500 MachinePointerInfo(Ptr), false, false, false, 0);
13501
13502 unsigned char OperandFlags = 0;
13503 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
13504 // initialexec.
13505 unsigned WrapperKind = X86ISD::Wrapper;
13506 if (model == TLSModel::LocalExec) {
13507 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13508 } else if (model == TLSModel::InitialExec) {
13509 if (is64Bit) {
13510 OperandFlags = X86II::MO_GOTTPOFF;
13511 WrapperKind = X86ISD::WrapperRIP;
13512 } else {
13513 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13514 }
13515 } else {
13516    llvm_unreachable("Unexpected model");
13517 }
13518
13519 // emit "addl x@ntpoff,%eax" (local exec)
13520 // or "addl x@indntpoff,%eax" (initial exec)
13521 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13522 SDValue TGA =
13523 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13524 GA->getOffset(), OperandFlags);
13525 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13526
13527 if (model == TLSModel::InitialExec) {
13528 if (isPIC && !is64Bit) {
13529 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13530 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13531 Offset);
13532 }
13533
13534 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13535 MachinePointerInfo::getGOT(), false, false, false, 0);
13536 }
13537
13538 // The address of the thread local variable is the add of the thread
13539 // pointer with the offset of the variable.
13540 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13541}
13542
13543SDValue
13544X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13545
13546 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13547 const GlobalValue *GV = GA->getGlobal();
13548
13549 if (Subtarget->isTargetELF()) {
13550 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13551
13552 switch (model) {
13553 case TLSModel::GeneralDynamic:
13554 if (Subtarget->is64Bit())
13555 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13556 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13557 case TLSModel::LocalDynamic:
13558 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13559 Subtarget->is64Bit());
13560 case TLSModel::InitialExec:
13561 case TLSModel::LocalExec:
13562 return LowerToTLSExecModel(
13563 GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13564 DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13565 }
13566    llvm_unreachable("Unknown TLS model.");
13567 }
13568
13569 if (Subtarget->isTargetDarwin()) {
13570 // Darwin only has one model of TLS. Lower to that.
13571 unsigned char OpFlag = 0;
13572 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13573 X86ISD::WrapperRIP : X86ISD::Wrapper;
13574
13575 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13576 // global base reg.
13577 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13578 !Subtarget->is64Bit();
13579 if (PIC32)
13580 OpFlag = X86II::MO_TLVP_PIC_BASE;
13581 else
13582 OpFlag = X86II::MO_TLVP;
13583 SDLoc DL(Op);
13584 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13585 GA->getValueType(0),
13586 GA->getOffset(), OpFlag);
13587 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13588
13589 // With PIC32, the address is actually $g + Offset.
13590 if (PIC32)
13591 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13592 DAG.getNode(X86ISD::GlobalBaseReg,
13593 SDLoc(), getPointerTy()),
13594 Offset);
13595
13596 // Lowering the machine isd will make sure everything is in the right
13597 // location.
13598 SDValue Chain = DAG.getEntryNode();
13599 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13600 SDValue Args[] = { Chain, Offset };
13601 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13602
13603 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13604 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13605 MFI->setAdjustsStack(true);
13606
13607 // And our return value (tls address) is in the standard call return value
13608 // location.
13609 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13610 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13611 Chain.getValue(1));
13612 }
13613
13614 if (Subtarget->isTargetKnownWindowsMSVC() ||
13615 Subtarget->isTargetWindowsGNU()) {
13616 // Just use the implicit TLS architecture
13617    // Need to generate something similar to:
13618 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13619 // ; from TEB
13620 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
13621 // mov rcx, qword [rdx+rcx*8]
13622 // mov eax, .tls$:tlsvar
13623 // [rax+rcx] contains the address
13624 // Windows 64bit: gs:0x58
13625 // Windows 32bit: fs:__tls_array
13626
13627 SDLoc dl(GA);
13628 SDValue Chain = DAG.getEntryNode();
13629
13630 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13631 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13632 // use its literal value of 0x2C.
13633 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13634 ? Type::getInt8PtrTy(*DAG.getContext(),
13635 256)
13636 : Type::getInt32PtrTy(*DAG.getContext(),
13637 257));
13638
13639 SDValue TlsArray =
13640 Subtarget->is64Bit()
13641 ? DAG.getIntPtrConstant(0x58)
13642 : (Subtarget->isTargetWindowsGNU()
13643 ? DAG.getIntPtrConstant(0x2C)
13644 : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13645
13646 SDValue ThreadPointer =
13647 DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13648 MachinePointerInfo(Ptr), false, false, false, 0);
13649
13650 // Load the _tls_index variable
13651 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13652 if (Subtarget->is64Bit())
13653 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13654 IDX, MachinePointerInfo(), MVT::i32,
13655 false, false, false, 0);
13656 else
13657 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13658 false, false, false, 0);
13659
13660 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13661 getPointerTy());
13662 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13663
13664 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13665 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13666 false, false, false, 0);
13667
13668 // Get the offset of start of .tls section
13669 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13670 GA->getValueType(0),
13671 GA->getOffset(), X86II::MO_SECREL);
13672 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13673
13674 // The address of the thread local variable is the add of the thread
13675 // pointer with the offset of the variable.
13676 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13677 }
13678
13679  llvm_unreachable("TLS not implemented for this target.");
13680}
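
// An illustrative pseudo-C sketch (parameter names are assumptions, not a real
// API) of the address computation materialized for the Windows targets above:
// ThreadLocalStoragePointer is loaded from gs:[0x58] on Win64 (or
// fs:[__tls_array] on Win32), _tls_index selects this module's TLS block, and
// the variable's @SECREL offset is added last.
static char *windowsImplicitTLSAddressSketch(char **ThreadLocalStoragePointer,
                                             unsigned TlsIndex,
                                             unsigned SecRelOffset) {
  char *ModuleTlsBlock = ThreadLocalStoragePointer[TlsIndex]; // the [rdx+rcx*8] load
  return ModuleTlsBlock + SecRelOffset;                       // add .tls$:tlsvar
}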
13681
13682/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13683/// and take a 2 x i32 value to shift plus a shift amount.
13684static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13685  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13686 MVT VT = Op.getSimpleValueType();
13687 unsigned VTBits = VT.getSizeInBits();
13688 SDLoc dl(Op);
13689 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13690 SDValue ShOpLo = Op.getOperand(0);
13691 SDValue ShOpHi = Op.getOperand(1);
13692 SDValue ShAmt = Op.getOperand(2);
13693 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13694 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13695 // during isel.
13696 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13697 DAG.getConstant(VTBits - 1, MVT::i8));
13698 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13699 DAG.getConstant(VTBits - 1, MVT::i8))
13700 : DAG.getConstant(0, VT);
13701
13702 SDValue Tmp2, Tmp3;
13703 if (Op.getOpcode() == ISD::SHL_PARTS) {
13704 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13705 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13706 } else {
13707 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13708 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13709 }
13710
13711  // If the shift amount is larger than or equal to the width of a part, we can't
13712 // rely on the results of shld/shrd. Insert a test and select the appropriate
13713 // values for large shift amounts.
13714 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13715 DAG.getConstant(VTBits, MVT::i8));
13716 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13717 AndNode, DAG.getConstant(0, MVT::i8));
13718
13719 SDValue Hi, Lo;
13720 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13721 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13722 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13723
13724 if (Op.getOpcode() == ISD::SHL_PARTS) {
13725 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13726 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13727 } else {
13728 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13729 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13730 }
13731
13732 SDValue Ops[2] = { Lo, Hi };
13733 return DAG.getMergeValues(Ops, dl);
13734}
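
// A standalone sketch (illustrative only, 32-bit parts assumed) of what the
// SHL_PARTS path above computes. SHLD only uses the shift count modulo the
// part width, so the count is masked for the plain SHL and the test against
// VTBits (bit 5 of the amount) plus a CMOV handles shifts of a full part or
// more.
struct ShiftPartsSketch { uint32_t Lo, Hi; };
static ShiftPartsSketch shlPartsSketch(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  unsigned Safe = Amt & 31;                                      // the AND with VTBits-1
  uint32_t Tmp2 = (Hi << Safe) | (Safe ? Lo >> (32 - Safe) : 0); // SHLD(Hi, Lo, Amt)
  uint32_t Tmp3 = Lo << Safe;                                    // SHL(Lo, SafeShAmt)
  if (Amt & 32)                                                  // CMP (Amt & VTBits), then CMOVNE
    return {0, Tmp3};                                            // Lo := 0, Hi := Tmp3
  return {Tmp3, Tmp2};                                           // Lo := Tmp3, Hi := Tmp2
}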
13735
13736SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13737 SelectionDAG &DAG) const {
13738 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13739 SDLoc dl(Op);
13740
13741 if (SrcVT.isVector()) {
13742 if (SrcVT.getVectorElementType() == MVT::i1) {
13743 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13744 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13745 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13746 Op.getOperand(0)));
13747 }
13748 return SDValue();
13749 }
13750
13751  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13752         "Unknown SINT_TO_FP to lower!");
13753
13754 // These are really Legal; return the operand so the caller accepts it as
13755 // Legal.
13756 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13757 return Op;
13758 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13759 Subtarget->is64Bit()) {
13760 return Op;
13761 }
13762
13763 unsigned Size = SrcVT.getSizeInBits()/8;
13764 MachineFunction &MF = DAG.getMachineFunction();
13765 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13766 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13767 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13768 StackSlot,
13769 MachinePointerInfo::getFixedStack(SSFI),
13770 false, false, 0);
13771 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13772}
13773
13774SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13775 SDValue StackSlot,
13776 SelectionDAG &DAG) const {
13777 // Build the FILD
13778 SDLoc DL(Op);
13779 SDVTList Tys;
13780 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13781 if (useSSE)
13782 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13783 else
13784 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13785
13786 unsigned ByteSize = SrcVT.getSizeInBits()/8;
13787
13788 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13789 MachineMemOperand *MMO;
13790 if (FI) {
13791 int SSFI = FI->getIndex();
13792 MMO =
13793 DAG.getMachineFunction()
13794 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13795 MachineMemOperand::MOLoad, ByteSize, ByteSize);
13796 } else {
13797 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13798 StackSlot = StackSlot.getOperand(1);
13799 }
13800 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13801 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13802 X86ISD::FILD, DL,
13803 Tys, Ops, SrcVT, MMO);
13804
13805 if (useSSE) {
13806 Chain = Result.getValue(1);
13807 SDValue InFlag = Result.getValue(2);
13808
13809 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13810 // shouldn't be necessary except that RFP cannot be live across
13811 // multiple blocks. When stackifier is fixed, they can be uncoupled.
13812 MachineFunction &MF = DAG.getMachineFunction();
13813 unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13814 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13815 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13816 Tys = DAG.getVTList(MVT::Other);
13817 SDValue Ops[] = {
13818 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13819 };
13820 MachineMemOperand *MMO =
13821 DAG.getMachineFunction()
13822 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13823 MachineMemOperand::MOStore, SSFISize, SSFISize);
13824
13825 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13826 Ops, Op.getValueType(), MMO);
13827 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13828 MachinePointerInfo::getFixedStack(SSFI),
13829 false, false, false, 0);
13830 }
13831
13832 return Result;
13833}
13834
13835// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13836SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13837 SelectionDAG &DAG) const {
13838  // This algorithm is not obvious. Here is what we're trying to output:
13839 /*
13840 movq %rax, %xmm0
13841 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13842 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13843 #ifdef __SSE3__
13844 haddpd %xmm0, %xmm0
13845 #else
13846 pshufd $0x4e, %xmm0, %xmm1
13847 addpd %xmm1, %xmm0
13848 #endif
13849 */
13850
13851 SDLoc dl(Op);
13852 LLVMContext *Context = DAG.getContext();
13853
13854 // Build some magic constants.
13855 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13856 Constant *C0 = ConstantDataVector::get(*Context, CV0);
13857 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13858
13859 SmallVector<Constant*,2> CV1;
13860 CV1.push_back(
13861 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13862 APInt(64, 0x4330000000000000ULL))));
13863 CV1.push_back(
13864 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13865 APInt(64, 0x4530000000000000ULL))));
13866 Constant *C1 = ConstantVector::get(CV1);
13867 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
13868
13869 // Load the 64-bit value into an XMM register.
13870 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13871 Op.getOperand(0));
13872 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13873 MachinePointerInfo::getConstantPool(),
13874 false, false, false, 16);
13875 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13876 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13877 CLod0);
13878
13879 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13880 MachinePointerInfo::getConstantPool(),
13881 false, false, false, 16);
13882 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13883 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13884 SDValue Result;
13885
13886 if (Subtarget->hasSSE3()) {
13887 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13888 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13889 } else {
13890 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13891 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13892 S2F, 0x4E, DAG);
13893 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13894 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13895 Sub);
13896 }
13897
13898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13899 DAG.getIntPtrConstant(0));
13900}
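
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the same bias trick as the vector code above: the low and high 32-bit halves
// are planted in the mantissas of 2^52 and 2^84, the biases are subtracted
// (subpd), and the two halves are summed (haddpd, or pshufd+addpd).
static double uint64ToDoubleBiasSketch(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL); // bits of 2^52 + lo32
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);           // bits of 2^84 + hi32*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  Lo -= 0x1.0p52;  // subpd with c1[0]
  Hi -= 0x1.0p84;  // subpd with c1[1] (0x1.0p52 * 0x1.0p32)
  return Hi + Lo;  // horizontal add; the only rounding step
}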
13901
13902// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13903SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13904 SelectionDAG &DAG) const {
13905 SDLoc dl(Op);
13906 // FP constant to bias correct the final result.
13907 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13908 MVT::f64);
13909
13910 // Load the 32-bit value into an XMM register.
13911 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13912 Op.getOperand(0));
13913
13914 // Zero out the upper parts of the register.
13915 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13916
13917 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13918 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13919 DAG.getIntPtrConstant(0));
13920
13921 // Or the load with the bias.
13922 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13923 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13924 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13925 MVT::v2f64, Load)),
13926 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13927 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13928 MVT::v2f64, Bias)));
13929 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13930 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13931 DAG.getIntPtrConstant(0));
13932
13933 // Subtract the bias.
13934 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13935
13936 // Handle final rounding.
13937 EVT DestVT = Op.getValueType();
13938
13939 if (DestVT.bitsLT(MVT::f64))
13940 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13941 DAG.getIntPtrConstant(0));
13942 if (DestVT.bitsGT(MVT::f64))
13943 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13944
13945 // Handle final rounding.
13946 return Sub;
13947}
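
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the OR-with-bias trick above: ORing the 32-bit value into the mantissa of
// 2^52 yields 2^52 + x exactly, and subtracting the Bias constant recovers x
// as a double with no rounding.
static double uint32ToDoubleBiasSketch(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // bit pattern of 2^52 + X
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52;                       // the FSUB with the Bias constant
}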
13948
13949static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13950 const X86Subtarget &Subtarget) {
13951 // The algorithm is the following:
13952 // #ifdef __SSE4_1__
13953 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13954 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13955 // (uint4) 0x53000000, 0xaa);
13956 // #else
13957 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13958 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13959 // #endif
13960 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13961 // return (float4) lo + fhi;
13962
13963 SDLoc DL(Op);
13964 SDValue V = Op->getOperand(0);
13965 EVT VecIntVT = V.getValueType();
13966 bool Is128 = VecIntVT == MVT::v4i32;
13967 EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13968  // If we convert to something other than the supported type, e.g., to v4f64,
13969 // abort early.
13970 if (VecFloatVT != Op->getValueType(0))
13971 return SDValue();
13972
13973 unsigned NumElts = VecIntVT.getVectorNumElements();
13974  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13975         "Unsupported custom type");
13976  assert(NumElts <= 8 && "The size of the constant array must be fixed");
13977
13978  // In the #ifdef/#else code, we have in common:
13979 // - The vector of constants:
13980 // -- 0x4b000000
13981 // -- 0x53000000
13982 // - A shift:
13983 // -- v >> 16
13984
13985 // Create the splat vector for 0x4b000000.
13986 SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
13987 SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
13988 CstLow, CstLow, CstLow, CstLow};
13989 SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13990 makeArrayRef(&CstLowArray[0], NumElts));
13991 // Create the splat vector for 0x53000000.
13992 SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
13993 SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
13994 CstHigh, CstHigh, CstHigh, CstHigh};
13995 SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13996 makeArrayRef(&CstHighArray[0], NumElts));
13997
13998 // Create the right shift.
13999 SDValue CstShift = DAG.getConstant(16, MVT::i32);
14000 SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14001 CstShift, CstShift, CstShift, CstShift};
14002 SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14003 makeArrayRef(&CstShiftArray[0], NumElts));
14004 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14005
14006 SDValue Low, High;
14007 if (Subtarget.hasSSE41()) {
14008 EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14009 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14010 SDValue VecCstLowBitcast =
14011 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14012 SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14013 // Low will be bitcasted right away, so do not bother bitcasting back to its
14014 // original type.
14015 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14016 VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14017 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14018 // (uint4) 0x53000000, 0xaa);
14019 SDValue VecCstHighBitcast =
14020 DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14021 SDValue VecShiftBitcast =
14022 DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14023 // High will be bitcasted right away, so do not bother bitcasting back to
14024 // its original type.
14025 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14026 VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14027 } else {
14028 SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14029 SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14030 CstMask, CstMask, CstMask);
14031 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14032 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14033 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14034
14035 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14036 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14037 }
14038
14039 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14040 SDValue CstFAdd = DAG.getConstantFP(
14041 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14042 SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14043 CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14044 SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14045 makeArrayRef(&CstFAddArray[0], NumElts));
14046
14047 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14048 SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14049 SDValue FHigh =
14050 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14051 // return (float4) lo + fhi;
14052 SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14053 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14054}
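
// A standalone scalar sketch (illustrative only; needs <cstdint>/<cstring>) of
// the #else branch described above: the low 16 bits ride in the mantissa of
// 2^23 and the high 16 bits in the mantissa of 2^39; subtracting (2^39 + 2^23)
// and adding the halves reconstructs the value with a single rounding at the
// final add.
static float uint32ToFloatSplitSketch(uint32_t V) {
  uint32_t LoBits = 0x4b000000u | (V & 0xffffu); // bits of 2^23 + lo16, exact
  uint32_t HiBits = 0x53000000u | (V >> 16);     // bits of 2^39 + hi16*2^16, exact
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // the CstFAdd constant, as a subtraction
  return Lo + FHi;
}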
14055
14056SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14057 SelectionDAG &DAG) const {
14058 SDValue N0 = Op.getOperand(0);
14059 MVT SVT = N0.getSimpleValueType();
14060 SDLoc dl(Op);
14061
14062 switch (SVT.SimpleTy) {
14063 default:
14064    llvm_unreachable("Custom UINT_TO_FP is not supported!");
14065 case MVT::v4i8:
14066 case MVT::v4i16:
14067 case MVT::v8i8:
14068 case MVT::v8i16: {
14069 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14070 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14071 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14072 }
14073 case MVT::v4i32:
14074 case MVT::v8i32:
14075 return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14076 }
14077  llvm_unreachable(nullptr);
14078}
14079
14080SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14081 SelectionDAG &DAG) const {
14082 SDValue N0 = Op.getOperand(0);
14083 SDLoc dl(Op);
14084
14085 if (Op.getValueType().isVector())
14086 return lowerUINT_TO_FP_vec(Op, DAG);
14087
14088 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14089 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14090 // the optimization here.
14091 if (DAG.SignBitIsZero(N0))
14092 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14093
14094 MVT SrcVT = N0.getSimpleValueType();
14095 MVT DstVT = Op.getSimpleValueType();
14096 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14097 return LowerUINT_TO_FP_i64(Op, DAG);
14098 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14099 return LowerUINT_TO_FP_i32(Op, DAG);
14100 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14101 return SDValue();
14102
14103 // Make a 64-bit buffer, and use it to build an FILD.
14104 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14105 if (SrcVT == MVT::i32) {
14106 SDValue WordOff = DAG.getConstant(4, getPointerTy());
14107 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14108 getPointerTy(), StackSlot, WordOff);
14109 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14110 StackSlot, MachinePointerInfo(),
14111 false, false, 0);
14112 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14113 OffsetSlot, MachinePointerInfo(),
14114 false, false, 0);
14115 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14116 return Fild;
14117 }
14118
14119  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14120 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14121 StackSlot, MachinePointerInfo(),
14122 false, false, 0);
14123 // For i64 source, we need to add the appropriate power of 2 if the input
14124 // was negative. This is the same as the optimization in
14125  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14126 // we must be careful to do the computation in x87 extended precision, not
14127 // in SSE. (The generic code can't know it's OK to do this, or how to.)
14128 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14129 MachineMemOperand *MMO =
14130 DAG.getMachineFunction()
14131 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14132 MachineMemOperand::MOLoad, 8, 8);
14133
14134 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14135 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14136 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14137 MVT::i64, MMO);
14138
14139 APInt FF(32, 0x5F800000ULL);
14140
14141 // Check whether the sign bit is set.
14142 SDValue SignSet = DAG.getSetCC(dl,
14143 getSetCCResultType(*DAG.getContext(), MVT::i64),
14144 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14145 ISD::SETLT);
14146
14147 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14148 SDValue FudgePtr = DAG.getConstantPool(
14149 ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14150 getPointerTy());
14151
14152 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14153 SDValue Zero = DAG.getIntPtrConstant(0);
14154 SDValue Four = DAG.getIntPtrConstant(4);
14155 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14156 Zero, Four);
14157 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14158
14159 // Load the value out, extending it from f32 to f80.
14160 // FIXME: Avoid the extend by constructing the right constant pool?
14161 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14162 FudgePtr, MachinePointerInfo::getConstantPool(),
14163 MVT::f32, false, false, false, 4);
14164 // Extend everything to 80 bits to force it to be done on x87.
14165 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14166 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14167}
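
// A standalone sketch (illustrative only; assumes long double is the x87
// 80-bit type, as the comments above require) of the i64 path: FILD converts
// the bits as a signed value, and when the sign bit was set the 0x5F800000
// fudge constant (2^64 as a float) is added back before the final rounding.
static double uint64ToDoubleFILDSketch(uint64_t X) {
  long double V = (long double)(int64_t)X; // what FILD of the 64-bit slot yields
  if ((int64_t)X < 0)
    V += 0x1.0p64L;                        // add 2^64 when the sign bit was set
  return (double)V;                        // the FP_ROUND to DstVT
}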
14168
14169std::pair<SDValue,SDValue>
14170X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14171 bool IsSigned, bool IsReplace) const {
14172 SDLoc DL(Op);
14173
14174 EVT DstTy = Op.getValueType();
14175
14176 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14177    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14178 DstTy = MVT::i64;
14179 }
14180
14181  assert(DstTy.getSimpleVT() <= MVT::i64 &&
14182         DstTy.getSimpleVT() >= MVT::i16 &&
14183         "Unknown FP_TO_INT to lower!");
14184
14185 // These are really Legal.
14186 if (DstTy == MVT::i32 &&
14187 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14188 return std::make_pair(SDValue(), SDValue());
14189 if (Subtarget->is64Bit() &&
14190 DstTy == MVT::i64 &&
14191 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14192 return std::make_pair(SDValue(), SDValue());
14193
14194 // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14195 // stack slot, or into the FTOL runtime function.
14196 MachineFunction &MF = DAG.getMachineFunction();
14197 unsigned MemSize = DstTy.getSizeInBits()/8;
14198 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14199 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14200
14201 unsigned Opc;
14202 if (!IsSigned && isIntegerTypeFTOL(DstTy))
14203 Opc = X86ISD::WIN_FTOL;
14204 else
14205 switch (DstTy.getSimpleVT().SimpleTy) {
14206    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14207 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14208 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14209 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14210 }
14211
14212 SDValue Chain = DAG.getEntryNode();
14213 SDValue Value = Op.getOperand(0);
14214 EVT TheVT = Op.getOperand(0).getValueType();
14215 // FIXME This causes a redundant load/store if the SSE-class value is already
14216 // in memory, such as if it is on the callstack.
14217 if (isScalarFPTypeInSSEReg(TheVT)) {
14218 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14219 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14220 MachinePointerInfo::getFixedStack(SSFI),
14221 false, false, 0);
14222 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14223 SDValue Ops[] = {
14224 Chain, StackSlot, DAG.getValueType(TheVT)
14225 };
14226
14227 MachineMemOperand *MMO =
14228 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14229 MachineMemOperand::MOLoad, MemSize, MemSize);
14230 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14231 Chain = Value.getValue(1);
14232 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14233 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14234 }
14235
14236 MachineMemOperand *MMO =
14237 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14238 MachineMemOperand::MOStore, MemSize, MemSize);
14239
14240 if (Opc != X86ISD::WIN_FTOL) {
14241 // Build the FP_TO_INT*_IN_MEM
14242 SDValue Ops[] = { Chain, Value, StackSlot };
14243 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14244 Ops, DstTy, MMO);
14245 return std::make_pair(FIST, StackSlot);
14246 } else {
14247 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14248 DAG.getVTList(MVT::Other, MVT::Glue),
14249 Chain, Value);
14250 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14251 MVT::i32, ftol.getValue(1));
14252 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14253 MVT::i32, eax.getValue(2));
14254 SDValue Ops[] = { eax, edx };
14255 SDValue pair = IsReplace
14256 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14257 : DAG.getMergeValues(Ops, DL);
14258 return std::make_pair(pair, SDValue());
14259 }
14260}
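// A minimal standalone sketch of what the FIST-based path above produces at
// run time, using a hypothetical helper (fpToSint64ViaStackSlot) that is not
// part of this file: the FP value is converted and stored into a stack
// temporary, and the integer result is then loaded back from that slot.
#include <cstdint>

static int64_t fpToSint64ViaStackSlot(double V) {
  // Stand-in for the frame index created with CreateStackObject(); assumes V
  // is in range for i64, as the FP_TO_SINT node itself does.
  alignas(8) int64_t Slot;
  Slot = static_cast<int64_t>(V); // FP_TO_INT64_IN_MEM: convert and store
  return Slot;                    // the load emitted later in LowerFP_TO_SINT
}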
14261
14262static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14263 const X86Subtarget *Subtarget) {
14264 MVT VT = Op->getSimpleValueType(0);
14265 SDValue In = Op->getOperand(0);
14266 MVT InVT = In.getSimpleValueType();
14267 SDLoc dl(Op);
14268
14269 // Optimize vectors in AVX mode:
14270 //
14271 // v8i16 -> v8i32
14272 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
14273 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
14274 // Concat upper and lower parts.
14275 //
14276 // v4i32 -> v4i64
14277 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
14278 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
14279 // Concat upper and lower parts.
14280 //
14281
14282 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14283 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14284 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14285 return SDValue();
14286
14287 if (Subtarget->hasInt256())
14288 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14289
14290 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14291 SDValue Undef = DAG.getUNDEF(InVT);
14292 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14293 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14294 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14295
14296 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14297 VT.getVectorNumElements()/2);
14298
14299 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14300 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14301
14302 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14303}
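// A minimal standalone sketch of the unpack-based widening above, assuming
// little-endian lane order; zextV8i16ToV8i32 is a hypothetical helper, not
// part of this lowering. Interleaving the source words with words from a
// zero vector puts a zero into the high half of every 32-bit lane, which is
// exactly a zero extension.
#include <array>
#include <cstdint>

static std::array<uint32_t, 8> zextV8i16ToV8i32(const std::array<uint16_t, 8> &In) {
  const uint16_t Zero = 0; // one lane of the getZeroVector() operand
  std::array<uint32_t, 8> Out;
  for (unsigned i = 0; i != 8; ++i)
    Out[i] = static_cast<uint32_t>(In[i]) |       // low word: the element
             (static_cast<uint32_t>(Zero) << 16); // high word: the zero
  // In the DAG, unpacklo covers elements 0..3, unpackhi covers 4..7, and the
  // two halves are rejoined with CONCAT_VECTORS.
  return Out;
}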
14304
14305static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14306 SelectionDAG &DAG) {
14307 MVT VT = Op->getSimpleValueType(0);
14308 SDValue In = Op->getOperand(0);
14309 MVT InVT = In.getSimpleValueType();
14310 SDLoc DL(Op);
14311 unsigned int NumElts = VT.getVectorNumElements();
14312 if (NumElts != 8 && NumElts != 16)
14313 return SDValue();
14314
14315 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14316 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14317
14318 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14319 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14320 // Now only the mask-extension case remains.
14321 assert(InVT.getVectorElementType() == MVT::i1);
14322 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14323 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14324 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14325 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14326 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14327 MachinePointerInfo::getConstantPool(),
14328 false, false, false, Alignment);
14329
14330 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14331 if (VT.is512BitVector())
14332 return Brcst;
14333 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14334}
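// A standalone sketch of the mask-extension case handled above, with a
// hypothetical helper (zextMaskToV8i64): zero-extending a vector of i1 mask
// bits amounts to selecting the constant 1 or 0 per lane, which is what the
// VBROADCASTM of the loaded splat constant performs in one step.
#include <array>
#include <bitset>
#include <cstdint>

static std::array<uint64_t, 8> zextMaskToV8i64(const std::bitset<8> &Mask) {
  std::array<uint64_t, 8> Out;
  for (unsigned i = 0; i != 8; ++i)
    Out[i] = Mask[i] ? 1u : 0u; // broadcast the constant 1 under the mask
  return Out;
}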
14335
14336static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14337 SelectionDAG &DAG) {
14338 if (Subtarget->hasFp256()) {
14339 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14340 if (Res.getNode())
14341 return Res;
14342 }
14343
14344 return SDValue();
14345}
14346
14347static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14348 SelectionDAG &DAG) {
14349 SDLoc DL(Op);
14350 MVT VT = Op.getSimpleValueType();
14351 SDValue In = Op.getOperand(0);
14352 MVT SVT = In.getSimpleValueType();
14353
14354 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14355 return LowerZERO_EXTEND_AVX512(Op, DAG);
14356
14357 if (Subtarget->hasFp256()) {
14358 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14359 if (Res.getNode())
14360 return Res;
14361 }
14362
14363 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14364 VT.getVectorNumElements() != SVT.getVectorNumElements());
14365 return SDValue();
14366}
14367
14368SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14369 SDLoc DL(Op);
14370 MVT VT = Op.getSimpleValueType();
14371 SDValue In = Op.getOperand(0);
14372 MVT InVT = In.getSimpleValueType();
14373
14374 if (VT == MVT::i1) {
14375 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14376 "Invalid scalar TRUNCATE operation");
14377 if (InVT.getSizeInBits() >= 32)
14378 return SDValue();
14379 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14380 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14381 }
14382 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14383 "Invalid TRUNCATE operation");
14384
14385 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14386 if (VT.getVectorElementType().getSizeInBits() >=8)
14387 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14388
14389 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14390 unsigned NumElts = InVT.getVectorNumElements();
14391 assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14392 if (InVT.getSizeInBits() < 512) {
14393 MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14394 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14395 InVT = ExtVT;
14396 }
14397
14398 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14399 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14400 SDValue CP = DAG.getConstantPool(C, getPointerTy());
14401 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14402 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14403 MachinePointerInfo::getConstantPool(),
14404 false, false, false, Alignment);
14405 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14406 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14407 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14408 }
14409
14410 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14411 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14412 if (Subtarget->hasInt256()) {
14413 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14414 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14415 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14416 ShufMask);
14417 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14418 DAG.getIntPtrConstant(0));
14419 }
14420
14421 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14422 DAG.getIntPtrConstant(0));
14423 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14424 DAG.getIntPtrConstant(2));
14425 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14426 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14427 static const int ShufMask[] = {0, 2, 4, 6};
14428 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14429 }
14430
14431 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14432 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14433 if (Subtarget->hasInt256()) {
14434 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14435
14436 SmallVector<SDValue,32> pshufbMask;
14437 for (unsigned i = 0; i < 2; ++i) {
14438 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14439 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14440 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14441 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14442 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14443 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14444 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14445 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14446 for (unsigned j = 0; j < 8; ++j)
14447 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14448 }
14449 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14450 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14451 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14452
14453 static const int ShufMask[] = {0, 2, -1, -1};
14454 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
14455 &ShufMask[0]);
14456 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14457 DAG.getIntPtrConstant(0));
14458 return DAG.getNode(ISD::BITCAST, DL, VT, In);
14459 }
14460
14461 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14462 DAG.getIntPtrConstant(0));
14463
14464 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14465 DAG.getIntPtrConstant(4));
14466
14467 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14468 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14469
14470 // The PSHUFB mask:
14471 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
14472 -1, -1, -1, -1, -1, -1, -1, -1};
14473
14474 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14475 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14476 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14477
14478 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14479 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14480
14481 // The MOVLHPS Mask:
14482 static const int ShufMask2[] = {0, 1, 4, 5};
14483 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14484 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14485 }
14486
14487 // Handle truncation of V256 to V128 using shuffles.
14488 if (!VT.is128BitVector() || !InVT.is256BitVector())
14489 return SDValue();
14490
14491 assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14492
14493 unsigned NumElems = VT.getVectorNumElements();
14494 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14495
14496 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14497 // Prepare truncation shuffle mask
14498 for (unsigned i = 0; i != NumElems; ++i)
14499 MaskVec[i] = i * 2;
14500 SDValue V = DAG.getVectorShuffle(NVT, DL,
14501 DAG.getNode(ISD::BITCAST, DL, NVT, In),
14502 DAG.getUNDEF(NVT), &MaskVec[0]);
14503 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14504 DAG.getIntPtrConstant(0));
14505}
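// A standalone sketch of the shuffle-based truncations above, assuming
// little-endian lane order; truncV4i64ToV4i32 is a hypothetical helper.
// Keeping only the even 32-bit lanes (mask {0, 2, 4, 6}) of the bitcast
// input drops the high half of every 64-bit element, which is exactly an
// integer truncation.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> truncV4i64ToV4i32(const std::array<uint64_t, 4> &In) {
  std::array<uint32_t, 4> Out;
  for (unsigned i = 0; i != 4; ++i)
    Out[i] = static_cast<uint32_t>(In[i]); // low 32 bits == even lane 2*i
  return Out;
}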
14506
14507SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14508 SelectionDAG &DAG) const {
14509 assert(!Op.getSimpleValueType().isVector());
14510
14511 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14512 /*IsSigned=*/ true, /*IsReplace=*/ false);
14513 SDValue FIST = Vals.first, StackSlot = Vals.second;
14514 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14515 if (!FIST.getNode()) return Op;
14516
14517 if (StackSlot.getNode())
14518 // Load the result.
14519 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14520 FIST, StackSlot, MachinePointerInfo(),
14521 false, false, false, 0);
14522
14523 // The node is the result.
14524 return FIST;
14525}
14526
14527SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14528 SelectionDAG &DAG) const {
14529 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14530 /*IsSigned=*/ false, /*IsReplace=*/ false);
14531 SDValue FIST = Vals.first, StackSlot = Vals.second;
14532 assert(FIST.getNode() && "Unexpected failure");
14533
14534 if (StackSlot.getNode())
14535 // Load the result.
14536 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14537 FIST, StackSlot, MachinePointerInfo(),
14538 false, false, false, 0);
14539
14540 // The node is the result.
14541 return FIST;
14542}
14543
14544static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14545 SDLoc DL(Op);
14546 MVT VT = Op.getSimpleValueType();
14547 SDValue In = Op.getOperand(0);
14548 MVT SVT = In.getSimpleValueType();
14549
14550 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14551
14552 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14553 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14554 In, DAG.getUNDEF(SVT)));
14555}
14556
14557/// The only differences between FABS and FNEG are the mask and the logic op.
14558/// FNEG also has a folding opportunity for FNEG(FABS(x)).
14559static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14560 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14561 "Wrong opcode for lowering FABS or FNEG.");
14562
14563 bool IsFABS = (Op.getOpcode() == ISD::FABS);
14564
14565 // If this is a FABS and it has an FNEG user, bail out to fold the combination
14566 // into an FNABS. We'll lower the FABS after that if it is still in use.
14567 if (IsFABS)
14568 for (SDNode *User : Op->uses())
14569 if (User->getOpcode() == ISD::FNEG)
14570 return Op;
14571
14572 SDValue Op0 = Op.getOperand(0);
14573 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14574
14575 SDLoc dl(Op);
14576 MVT VT = Op.getSimpleValueType();
14577 // Assume scalar op for initialization; update for vector if needed.
14578 // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14579 // generate a 16-byte vector constant and logic op even for the scalar case.
14580 // Using a 16-byte mask allows folding the load of the mask with
14581 // the logic op, so it can save (~4 bytes) on code size.
14582 MVT EltVT = VT;
14583 unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14584 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14585 // decide if we should generate a 16-byte constant mask when we only need 4 or
14586 // 8 bytes for the scalar case.
14587 if (VT.isVector()) {
14588 EltVT = VT.getVectorElementType();
14589 NumElts = VT.getVectorNumElements();
14590 }
14591
14592 unsigned EltBits = EltVT.getSizeInBits();
14593 LLVMContext *Context = DAG.getContext();
14594 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14595 APInt MaskElt =
14596 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14597 Constant *C = ConstantInt::get(*Context, MaskElt);
14598 C = ConstantVector::getSplat(NumElts, C);
14599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14600 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14601 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14602 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14603 MachinePointerInfo::getConstantPool(),
14604 false, false, false, Alignment);
14605
14606 if (VT.isVector()) {
14607 // For a vector, cast operands to a vector type, perform the logic op,
14608 // and cast the result back to the original value type.
14609 MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14610 SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14611 SDValue Operand = IsFNABS ?
14612 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14613 DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14614 unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14615 return DAG.getNode(ISD::BITCAST, dl, VT,
14616 DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14617 }
14618
14619 // If not vector, then scalar.
14620 unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14621 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14622 return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14623}
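// A standalone scalar sketch of the constant masks built above for the f32
// case (fabsViaMask and fnegViaMask are hypothetical helpers): FABS clears
// the sign bit with an AND against 0x7fffffff, FNEG flips it with an XOR
// against 0x80000000, and FNABS would set it with an OR.
#include <cstdint>
#include <cstring>

static float fabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;              // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

static float fnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;              // APInt::getSignBit(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}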
14624
14625static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14627 LLVMContext *Context = DAG.getContext();
14628 SDValue Op0 = Op.getOperand(0);
14629 SDValue Op1 = Op.getOperand(1);
14630 SDLoc dl(Op);
14631 MVT VT = Op.getSimpleValueType();
14632 MVT SrcVT = Op1.getSimpleValueType();
14633
14634 // If second operand is smaller, extend it first.
14635 if (SrcVT.bitsLT(VT)) {
14636 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14637 SrcVT = VT;
14638 }
14639 // And if it is bigger, shrink it first.
14640 if (SrcVT.bitsGT(VT)) {
14641 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14642 SrcVT = VT;
14643 }
14644
14645 // At this point the operands and the result should have the same
14646 // type, and that won't be f80 since that is not custom lowered.
14647
14648 const fltSemantics &Sem =
14649 VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14650 const unsigned SizeInBits = VT.getSizeInBits();
14651
14652 SmallVector<Constant *, 4> CV(
14653 VT == MVT::f64 ? 2 : 4,
14654 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14655
14656 // First, clear all bits but the sign bit from the second operand (sign).
14657 CV[0] = ConstantFP::get(*Context,
14658 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14659 Constant *C = ConstantVector::get(CV);
14660 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14661 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14662 MachinePointerInfo::getConstantPool(),
14663 false, false, false, 16);
14664 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14665
14666 // Next, clear the sign bit from the first operand (magnitude).
14667 // If it's a constant, we can clear it here.
14668 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14669 APFloat APF = Op0CN->getValueAPF();
14670 // If the magnitude is a positive zero, the sign bit alone is enough.
14671 if (APF.isPosZero())
14672 return SignBit;
14673 APF.clearSign();
14674 CV[0] = ConstantFP::get(*Context, APF);
14675 } else {
14676 CV[0] = ConstantFP::get(
14677 *Context,
14678 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14679 }
14680 C = ConstantVector::get(CV);
14681 CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14682 SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14683 MachinePointerInfo::getConstantPool(),
14684 false, false, false, 16);
14685 // If the magnitude operand wasn't a constant, we need to AND out the sign.
14686 if (!isa<ConstantFPSDNode>(Op0))
14687 Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14688
14689 // OR the magnitude value with the sign bit.
14690 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14691}
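// A standalone scalar sketch of the two masks used above for f64
// (copysignViaMasks is a hypothetical helper): keep only the sign bit of the
// sign operand, clear the sign bit of the magnitude operand, then OR the two.
#include <cstdint>
#include <cstring>

static double copysignViaMasks(double Mag, double Sgn) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sgn, sizeof(S));
  uint64_t SignBit = S & 0x8000000000000000ULL; // getHighBitsSet(64, 1)
  uint64_t Rest    = M & 0x7fffffffffffffffULL; // getLowBitsSet(64, 63)
  uint64_t R = Rest | SignBit;                  // X86ISD::FOR of the two parts
  double Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}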
14692
14693static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14694 SDValue N0 = Op.getOperand(0);
14695 SDLoc dl(Op);
14696 MVT VT = Op.getSimpleValueType();
14697
14698 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14699 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14700 DAG.getConstant(1, VT));
14701 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14702}
14703
14704// Check whether an OR'd tree is PTEST-able.
14705static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14706 SelectionDAG &DAG) {
14707 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14708
14709 if (!Subtarget->hasSSE41())
14710 return SDValue();
14711
14712 if (!Op->hasOneUse())
14713 return SDValue();
14714
14715 SDNode *N = Op.getNode();
14716 SDLoc DL(N);
14717
14718 SmallVector<SDValue, 8> Opnds;
14719 DenseMap<SDValue, unsigned> VecInMap;
14720 SmallVector<SDValue, 8> VecIns;
14721 EVT VT = MVT::Other;
14722
14723 // Recognize a special case where a vector is cast into a wide integer to
14724 // test all 0s.
14725 Opnds.push_back(N->getOperand(0));
14726 Opnds.push_back(N->getOperand(1));
14727
14728 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14729 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14730 // BFS traverse all OR'd operands.
14731 if (I->getOpcode() == ISD::OR) {
14732 Opnds.push_back(I->getOperand(0));
14733 Opnds.push_back(I->getOperand(1));
14734 // Re-evaluate the number of nodes to be traversed.
14735 e += 2; // 2 more nodes (LHS and RHS) are pushed.
14736 continue;
14737 }
14738
14739 // Quit if this is not an EXTRACT_VECTOR_ELT.
14740 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14741 return SDValue();
14742
14744 // Quit if the index is not a constant.
14744 SDValue Idx = I->getOperand(1);
14745 if (!isa<ConstantSDNode>(Idx))
14746 return SDValue();
14747
14748 SDValue ExtractedFromVec = I->getOperand(0);
14749 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14750 if (M == VecInMap.end()) {
14751 VT = ExtractedFromVec.getValueType();
14752 // Quit if not 128/256-bit vector.
14753 if (!VT.is128BitVector() && !VT.is256BitVector())
14754 return SDValue();
14755 // Quit if not the same type.
14756 if (VecInMap.begin() != VecInMap.end() &&
14757 VT != VecInMap.begin()->first.getValueType())
14758 return SDValue();
14759 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14760 VecIns.push_back(ExtractedFromVec);
14761 }
14762 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14763 }
14764
14765 assert((VT.is128BitVector() || VT.is256BitVector()) &&
14766 "Not extracted from 128-/256-bit vector.");
14767
14768 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14769
14770 for (DenseMap<SDValue, unsigned>::const_iterator
14771 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14772 // Quit if not all elements are used.
14773 if (I->second != FullMask)
14774 return SDValue();
14775 }
14776
14777 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14778
14779 // Cast all vectors into TestVT for PTEST.
14780 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14781 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14782
14783 // If more than one full vector is evaluated, OR them first before PTEST.
14784 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14785 // Each iteration will OR 2 nodes and append the result until there is only
14786 // 1 node left, i.e. the final OR'd value of all vectors.
14787 SDValue LHS = VecIns[Slot];
14788 SDValue RHS = VecIns[Slot + 1];
14789 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14790 }
14791
14792 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14793 VecIns.back(), VecIns.back());
14794}
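// A standalone sketch of the pattern this routine recognizes, using a
// hypothetical helper (allZeroViaOrTree): a wide value built by OR-ing every
// extracted element of a vector is zero exactly when the whole vector is
// zero, so the OR tree can be replaced by a single PTEST of the vector.
#include <array>
#include <cstdint>

static bool allZeroViaOrTree(const std::array<uint64_t, 4> &Vec) {
  uint64_t Acc = 0;
  for (uint64_t Elt : Vec)
    Acc |= Elt;        // models the OR'd tree of EXTRACT_VECTOR_ELTs
  return Acc == 0;     // the condition PTEST evaluates on the vector itself
}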
14795
14796/// \brief return true if \c Op has a use that doesn't just read flags.
14797static bool hasNonFlagsUse(SDValue Op) {
14798 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14799 ++UI) {
14800 SDNode *User = *UI;
14801 unsigned UOpNo = UI.getOperandNo();
14802 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14803 // Look past the truncate.
14804 UOpNo = User->use_begin().getOperandNo();
14805 User = *User->use_begin();
14806 }
14807
14808 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14809 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14810 return true;
14811 }
14812 return false;
14813}
14814
14815/// Emit nodes that will be selected as "test Op0,Op0", or something
14816/// equivalent.
14817SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
14818 SelectionDAG &DAG) const {
14819 if (Op.getValueType() == MVT::i1)
14820 // KORTEST instruction should be selected
14821 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14822 DAG.getConstant(0, Op.getValueType()));
14823
14824 // CF and OF aren't always set the way we want. Determine which
14825 // of these we need.
14826 bool NeedCF = false;
14827 bool NeedOF = false;
14828 switch (X86CC) {
14829 default: break;
14830 case X86::COND_A: case X86::COND_AE:
14831 case X86::COND_B: case X86::COND_BE:
14832 NeedCF = true;
14833 break;
14834 case X86::COND_G: case X86::COND_GE:
14835 case X86::COND_L: case X86::COND_LE:
14836 case X86::COND_O: case X86::COND_NO: {
14837 // Check if we really need to set the
14838 // Overflow flag. If NoSignedWrap is present,
14839 // that is not actually needed.
14840 switch (Op->getOpcode()) {
14841 case ISD::ADD:
14842 case ISD::SUB:
14843 case ISD::MUL:
14844 case ISD::SHL: {
14845 const BinaryWithFlagsSDNode *BinNode =
14846 cast<BinaryWithFlagsSDNode>(Op.getNode());
14847 if (BinNode->hasNoSignedWrap())
14848 break;
14849 }
14850 default:
14851 NeedOF = true;
14852 break;
14853 }
14854 break;
14855 }
14856 }
14857 // See if we can use the EFLAGS value from the operand instead of
14858 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14859 // we prove that the arithmetic won't overflow, we can't use OF or CF.
14860 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14861 // Emit a CMP with 0, which is the TEST pattern.
14862 //if (Op.getValueType() == MVT::i1)
14863 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
14864 // DAG.getConstant(0, MVT::i1));
14865 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14866 DAG.getConstant(0, Op.getValueType()));
14867 }
14868 unsigned Opcode = 0;
14869 unsigned NumOperands = 0;
14870
14871 // Truncate operations may prevent the merge of the SETCC instruction
14872 // and the arithmetic instruction before it. Attempt to truncate the operands
14873 // of the arithmetic instruction and use a reduced bit-width instruction.
14874 bool NeedTruncation = false;
14875 SDValue ArithOp = Op;
14876 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14877 SDValue Arith = Op->getOperand(0);
14878 // Both the trunc and the arithmetic op need to have one user each.
14879 if (Arith->hasOneUse())
14880 switch (Arith.getOpcode()) {
14881 default: break;
14882 case ISD::ADD:
14883 case ISD::SUB:
14884 case ISD::AND:
14885 case ISD::OR:
14886 case ISD::XOR: {
14887 NeedTruncation = true;
14888 ArithOp = Arith;
14889 }
14890 }
14891 }
14892
14893 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14894 // which may be the result of a CAST. We use the variable 'Op', which is the
14895 // non-casted variable when we check for possible users.
14896 switch (ArithOp.getOpcode()) {
14897 case ISD::ADD:
14898 // Due to an isel shortcoming, be conservative if this add is likely to be
14899 // selected as part of a load-modify-store instruction. When the root node
14900 // in a match is a store, isel doesn't know how to remap non-chain non-flag
14901 // uses of other nodes in the match, such as the ADD in this case. This
14902 // leads to the ADD being left around and reselected, with the result being
14903 // two adds in the output. Alas, even if none of our users are stores, that
14904 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
14905 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
14906 // climbing the DAG back to the root, and it doesn't seem to be worth the
14907 // effort.
14908 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14909 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14910 if (UI->getOpcode() != ISD::CopyToReg &&
14911 UI->getOpcode() != ISD::SETCC &&
14912 UI->getOpcode() != ISD::STORE)
14913 goto default_case;
14914
14915 if (ConstantSDNode *C =
14916 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14917 // An add of one will be selected as an INC.
14918 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
14919 Opcode = X86ISD::INC;
14920 NumOperands = 1;
14921 break;
14922 }
14923
14924 // An add of negative one (subtract of one) will be selected as a DEC.
14925 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
14926 Opcode = X86ISD::DEC;
14927 NumOperands = 1;
14928 break;
14929 }
14930 }
14931
14932 // Otherwise use a regular EFLAGS-setting add.
14933 Opcode = X86ISD::ADD;
14934 NumOperands = 2;
14935 break;
14936 case ISD::SHL:
14937 case ISD::SRL:
14938 // If we have a constant logical shift that's only used in a comparison
14939 // against zero turn it into an equivalent AND. This allows turning it into
14940 // a TEST instruction later.
14941 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14942 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14943 EVT VT = Op.getValueType();
14944 unsigned BitWidth = VT.getSizeInBits();
14945 unsigned ShAmt = Op->getConstantOperandVal(1);
14946 if (ShAmt >= BitWidth) // Avoid undefined shifts.
14947 break;
14948 APInt Mask = ArithOp.getOpcode() == ISD::SRL
14949 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14950 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14951 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14952 break;
14953 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14954 DAG.getConstant(Mask, VT));
14955 DAG.ReplaceAllUsesWith(Op, New);
14956 Op = New;
14957 }
14958 break;
14959
14960 case ISD::AND:
14961 // If the primary result of the AND isn't used, don't bother using X86ISD::AND,
14962 // because a TEST instruction will be better.
14963 if (!hasNonFlagsUse(Op))
14964 break;
14965 // FALL THROUGH
14966 case ISD::SUB:
14967 case ISD::OR:
14968 case ISD::XOR:
14969 // Due to the ISEL shortcoming noted above, be conservative if this op is
14970 // likely to be selected as part of a load-modify-store instruction.
14971 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14972 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14973 if (UI->getOpcode() == ISD::STORE)
14974 goto default_case;
14975
14976 // Otherwise use a regular EFLAGS-setting instruction.
14977 switch (ArithOp.getOpcode()) {
14978 default: llvm_unreachable("unexpected operator!");
14979 case ISD::SUB: Opcode = X86ISD::SUB; break;
14980 case ISD::XOR: Opcode = X86ISD::XOR; break;
14981 case ISD::AND: Opcode = X86ISD::AND; break;
14982 case ISD::OR: {
14983 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14984 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
14985 if (EFLAGS.getNode())
14986 return EFLAGS;
14987 }
14988 Opcode = X86ISD::OR;
14989 break;
14990 }
14991 }
14992
14993 NumOperands = 2;
14994 break;
14995 case X86ISD::ADD:
14996 case X86ISD::SUB:
14997 case X86ISD::INC:
14998 case X86ISD::DEC:
14999 case X86ISD::OR:
15000 case X86ISD::XOR:
15001 case X86ISD::AND:
15002 return SDValue(Op.getNode(), 1);
15003 default:
15004 default_case:
15005 break;
15006 }
15007
15008 // If we found that truncation is beneficial, perform the truncation and
15009 // update 'Op'.
15010 if (NeedTruncation) {
15011 EVT VT = Op.getValueType();
15012 SDValue WideVal = Op->getOperand(0);
15013 EVT WideVT = WideVal.getValueType();
15014 unsigned ConvertedOp = 0;
15015 // Use a target machine opcode to prevent further DAGCombine
15016 // optimizations that may separate the arithmetic operations
15017 // from the setcc node.
15018 switch (WideVal.getOpcode()) {
15019 default: break;
15020 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15021 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15022 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15023 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15024 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15025 }
15026
15027 if (ConvertedOp) {
15028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15029 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15030 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15031 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15032 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15033 }
15034 }
15035 }
15036
15037 if (Opcode == 0)
15038 // Emit a CMP with 0, which is the TEST pattern.
15039 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15040 DAG.getConstant(0, Op.getValueType()));
15041
15042 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15043 SmallVector<SDValue, 4> Ops;
15044 for (unsigned i = 0; i != NumOperands; ++i)
15045 Ops.push_back(Op.getOperand(i));
15046
15047 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15048 DAG.ReplaceAllUsesWith(Op, New);
15049 return SDValue(New.getNode(), 1);
15050}
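// A standalone sketch of the shift-to-TEST rewrite in the ISD::SRL case
// above (srlIsZero is a hypothetical helper, and ShAmt is assumed to be less
// than the bit width): comparing a constant logical shift against zero only
// asks whether the surviving bits are zero, so the shift can be replaced by
// an AND with the corresponding mask, which later becomes a TEST.
#include <cstdint>

static bool srlIsZero(uint32_t X, unsigned ShAmt) {
  uint32_t Mask = ~0u << ShAmt;  // APInt::getHighBitsSet(32, 32 - ShAmt)
  return (X & Mask) == 0;        // same truth value as (X >> ShAmt) == 0
}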
15051
15052/// Emit nodes that will be selected as "cmp Op0,Op1", or something
15053/// equivalent.
15054SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15055 SDLoc dl, SelectionDAG &DAG) const {
15056 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15057 if (C->getAPIntValue() == 0)
15058 return EmitTest(Op0, X86CC, dl, DAG);
15059
15060 if (Op0.getValueType() == MVT::i1)
15061 llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15062 }
15063
15064 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15065 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15066 // Do the comparison at i32 if it's smaller, besides the Atom case.
15067 // This avoids subregister aliasing issues. Keep the smaller reference
15068 // if we're optimizing for size, however, as that'll allow better folding
15069 // of memory operations.
15070 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15071 !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15072 AttributeSet::FunctionIndex, Attribute::MinSize) &&
15073 !Subtarget->isAtom()) {
15074 unsigned ExtendOp =
15075 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15076 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15077 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15078 }
15079 // Use SUB instead of CMP to enable CSE between SUB and CMP.
15080 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15081 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15082 Op0, Op1);
15083 return SDValue(Sub.getNode(), 1);
15084 }
15085 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15086}
15087
15088/// Convert a comparison if required by the subtarget.
15089SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15090 SelectionDAG &DAG) const {
15091 // If the subtarget does not support the FUCOMI instruction, floating-point
15092 // comparisons have to be converted.
15093 if (Subtarget->hasCMov() ||
15094 Cmp.getOpcode() != X86ISD::CMP ||
15095 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15096 !Cmp.getOperand(1).getValueType().isFloatingPoint())
15097 return Cmp;
15098
15099 // The instruction selector will select an FUCOM instruction instead of
15100 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15101 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15102 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15103 SDLoc dl(Cmp);
15104 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15105 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15106 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15107 DAG.getConstant(8, MVT::i8));
15108 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15109 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15110}
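// A standalone sketch of the FPSW-to-EFLAGS transfer built above
// (conditionByteFromFPSW is a hypothetical helper): FNSTSW stores the x87
// status word, the SRL by 8 moves the condition bits into the low byte, and
// SAHF copies that byte into EFLAGS.
#include <cstdint>

static uint8_t conditionByteFromFPSW(uint16_t FPSW) {
  return static_cast<uint8_t>(FPSW >> 8); // the byte SAHF loads into EFLAGS
}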
15111
15112/// The minimum architected relative accuracy is 2^-12. We need one
15113/// Newton-Raphson step to have a good float result (24 bits of precision).
15114SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15115 DAGCombinerInfo &DCI,
15116 unsigned &RefinementSteps,
15117 bool &UseOneConstNR) const {
15118 // FIXME: We should use instruction latency models to calculate the cost of
15119 // each potential sequence, but this is very hard to do reliably because
15120 // at least Intel's Core* chips have variable timing based on the number of
15121 // significant digits in the divisor and/or sqrt operand.
15122 if (!Subtarget->useSqrtEst())
15123 return SDValue();
15124
15125 EVT VT = Op.getValueType();
15126
15127 // SSE1 has rsqrtss and rsqrtps.
15128 // TODO: Add support for AVX512 (v16f32).
15129 // It is likely not profitable to do this for f64 because a double-precision
15130 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15131 // instructions: convert to single, rsqrtss, convert back to double, refine
15132 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15133 // along with FMA, this could be a throughput win.
15134 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15135 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15136 RefinementSteps = 1;
15137 UseOneConstNR = false;
15138 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15139 }
15140 return SDValue();
15141}
15142
15143/// The minimum architected relative accuracy is 2^-12. We need one
15144/// Newton-Raphson step to have a good float result (24 bits of precision).
15145SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15146 DAGCombinerInfo &DCI,
15147 unsigned &RefinementSteps) const {
15148 // FIXME: We should use instruction latency models to calculate the cost of
15149 // each potential sequence, but this is very hard to do reliably because
15150 // at least Intel's Core* chips have variable timing based on the number of
15151 // significant digits in the divisor.
15152 if (!Subtarget->useReciprocalEst())
15153 return SDValue();
15154
15155 EVT VT = Op.getValueType();
15156
15157 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15158 // TODO: Add support for AVX512 (v16f32).
15159 // It is likely not profitable to do this for f64 because a double-precision
15160 // reciprocal estimate with refinement on x86 prior to FMA requires
15161 // 15 instructions: convert to single, rcpss, convert back to double, refine
15162 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15163 // along with FMA, this could be a throughput win.
15164 if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15165 (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15166 RefinementSteps = ReciprocalEstimateRefinementSteps;
15167 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15168 }
15169 return SDValue();
15170}
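// A standalone sketch of the Newton-Raphson refinement that the estimate
// nodes above rely on (refineRecip and refineRsqrt are hypothetical helpers):
// one step applied to the ~2^-12-accurate hardware estimate x0 is enough for
// a ~24-bit float result.
//   reciprocal of a:   x1 = x0 * (2 - a * x0)
//   rsqrt of a:        x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
static float refineRecip(float A, float X0) {
  return X0 * (2.0f - A * X0);
}

static float refineRsqrt(float A, float X0) {
  return X0 * (1.5f - 0.5f * A * X0 * X0);
}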
15171
15172static bool isAllOnes(SDValue V) {
15173 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15174 return C && C->isAllOnesValue();
15175}
15176
15177/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15178/// if it's possible.
15179SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15180 SDLoc dl, SelectionDAG &DAG) const {
15181 SDValue Op0 = And.getOperand(0);
15182 SDValue Op1 = And.getOperand(1);
15183 if (Op0.getOpcode() == ISD::TRUNCATE)
15184 Op0 = Op0.getOperand(0);
15185 if (Op1.getOpcode() == ISD::TRUNCATE)
15186 Op1 = Op1.getOperand(0);
15187
15188 SDValue LHS, RHS;
15189 if (Op1.getOpcode() == ISD::SHL)
15190 std::swap(Op0, Op1);
15191 if (Op0.getOpcode() == ISD::SHL) {
15192 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15193 if (And00C->getZExtValue() == 1) {
15194 // If we looked past a truncate, check that it's only truncating away
15195 // known zeros.
15196 unsigned BitWidth = Op0.getValueSizeInBits();
15197 unsigned AndBitWidth = And.getValueSizeInBits();
15198 if (BitWidth > AndBitWidth) {
15199 APInt Zeros, Ones;
15200 DAG.computeKnownBits(Op0, Zeros, Ones);
15201 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15202 return SDValue();
15203 }
15204 LHS = Op1;
15205 RHS = Op0.getOperand(1);
15206 }
15207 } else if (Op1.getOpcode() == ISD::Constant) {
15208 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15209 uint64_t AndRHSVal = AndRHS->getZExtValue();
15210 SDValue AndLHS = Op0;
15211
15212 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15213 LHS = AndLHS.getOperand(0);
15214 RHS = AndLHS.getOperand(1);
15215 }
15216
15217 // Use BT if the immediate can't be encoded in a TEST instruction.
15218 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15219 LHS = AndLHS;
15220 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15221 }
15222 }
15223
15224 if (LHS.getNode()) {
15225 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
15226 // instruction. Since the shift amount is in-range-or-undefined, we know
15227 // that doing a bittest on the i32 value is ok. We extend to i32 because
15228 // the encoding for the i16 version is larger than the i32 version.
15229 // Also promote i16 to i32 for performance / code size reasons.
15230 if (LHS.getValueType() == MVT::i8 ||
15231 LHS.getValueType() == MVT::i16)
15232 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15233
15234 // If the operand types disagree, extend the shift amount to match. Since
15235 // BT ignores high bits (like shifts) we can use anyextend.
15236 if (LHS.getValueType() != RHS.getValueType())
15237 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15238
15239 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15240 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15241 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15242 DAG.getConstant(Cond, MVT::i8), BT);
15243 }
15244
15245 return SDValue();
15246}
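// A standalone sketch of the equivalence this routine exploits (bitTest is a
// hypothetical helper, and N is assumed to be in range): comparing
// "X & (1 << N)" or "(X >> N) & 1" against zero is a single bit test of bit
// N, which is what BT places into CF.
#include <cstdint>

static bool bitTest(uint64_t X, unsigned N) {
  return ((X >> N) & 1u) != 0;
}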
15247
15248/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15249/// mask CMPs.
15250static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15251 SDValue &Op1) {
15252 unsigned SSECC;
15253 bool Swap = false;
15254
15255 // SSE Condition code mapping:
15256 // 0 - EQ
15257 // 1 - LT
15258 // 2 - LE
15259 // 3 - UNORD
15260 // 4 - NEQ
15261 // 5 - NLT
15262 // 6 - NLE
15263 // 7 - ORD
15264 switch (SetCCOpcode) {
15265 default: llvm_unreachable("Unexpected SETCC condition");
15266 case ISD::SETOEQ:
15267 case ISD::SETEQ: SSECC = 0; break;
15268 case ISD::SETOGT:
15269 case ISD::SETGT: Swap = true; // Fallthrough
15270 case ISD::SETLT:
15271 case ISD::SETOLT: SSECC = 1; break;
15272 case ISD::SETOGE:
15273 case ISD::SETGE: Swap = true; // Fallthrough
15274 case ISD::SETLE:
15275 case ISD::SETOLE: SSECC = 2; break;
15276 case ISD::SETUO: SSECC = 3; break;
15277 case ISD::SETUNE:
15278 case ISD::SETNE: SSECC = 4; break;
15279 case ISD::SETULE: Swap = true; // Fallthrough
15280 case ISD::SETUGE: SSECC = 5; break;
15281 case ISD::SETULT: Swap = true; // Fallthrough
15282 case ISD::SETUGT: SSECC = 6; break;
15283 case ISD::SETO: SSECC = 7; break;
15284 case ISD::SETUEQ:
15285 case ISD::SETONE: SSECC = 8; break;
15286 }
15287 if (Swap)
15288 std::swap(Op0, Op1);
15289
15290 return SSECC;
15291}
15292
15293// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
15294// ones, and then concatenate the result back.
15295static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15296 MVT VT = Op.getSimpleValueType();
15297
15298 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15299 "Unsupported value type for operation");
15300
15301 unsigned NumElems = VT.getVectorNumElements();
15302 SDLoc dl(Op);
15303 SDValue CC = Op.getOperand(2);
15304
15305 // Extract the LHS vectors
15306 SDValue LHS = Op.getOperand(0);
15307 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15308 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15309
15310 // Extract the RHS vectors
15311 SDValue RHS = Op.getOperand(1);
15312 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15313 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15314
15315 // Issue the operation on the smaller types and concatenate the result back
15316 MVT EltVT = VT.getVectorElementType();
15317 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15318 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15319 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15320 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15321}
15322
15323static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15324 const X86Subtarget *Subtarget) {
15325 SDValue Op0 = Op.getOperand(0);
15326 SDValue Op1 = Op.getOperand(1);
15327 SDValue CC = Op.getOperand(2);
15328 MVT VT = Op.getSimpleValueType();
15329 SDLoc dl(Op);
15330
15331 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15332 Op.getValueType().getScalarType() == MVT::i1 &&
15333 "Cannot set masked compare for this operation");
15334
15335 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15336 unsigned Opc = 0;
15337 bool Unsigned = false;
15338 bool Swap = false;
15339 unsigned SSECC;
15340 switch (SetCCOpcode) {
15341 default: llvm_unreachable("Unexpected SETCC condition");
15342 case ISD::SETNE: SSECC = 4; break;
15343 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
15344 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15345 case ISD::SETLT: Swap = true; //fall-through
15346 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
15347 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15348 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15349 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
15350 case ISD::SETULE: Unsigned = true; //fall-through
15351 case ISD::SETLE: SSECC = 2; break;
15352 }
15353
15354 if (Swap)
15355 std::swap(Op0, Op1);
15356 if (Opc)
15357 return DAG.getNode(Opc, dl, VT, Op0, Op1);
15358 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15359 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15360 DAG.getConstant(SSECC, MVT::i8));
15361}
15362
15363/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15364/// operand \p Op1. If non-trivial (for example because it's not constant)
15365/// return an empty value.
15366static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15367{
15368 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15369 if (!BV)
15370 return SDValue();
15371
15372 MVT VT = Op1.getSimpleValueType();
15373 MVT EVT = VT.getVectorElementType();
15374 unsigned n = VT.getVectorNumElements();
15375 SmallVector<SDValue, 8> ULTOp1;
15376
15377 for (unsigned i = 0; i < n; ++i) {
15378 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15379 if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15380 return SDValue();
15381
15382 // Avoid underflow.
15383 APInt Val = Elt->getAPIntValue();
15384 if (Val == 0)
15385 return SDValue();
15386
15387 ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15388 }
15389
15390 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15391}
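// A standalone sketch of the rewrite above (ultViaUle is a hypothetical
// helper): for an unsigned constant C != 0, "x <u C" has the same truth
// value as "x <=u C - 1"; the Val == 0 bail-out above is what makes the
// C - 1 computation safe.
#include <cstdint>

static bool ultViaUle(uint32_t X, uint32_t C) {
  return X <= C - 1u; // caller guarantees C != 0
}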
15392
15393static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15394 SelectionDAG &DAG) {
15395 SDValue Op0 = Op.getOperand(0);
15396 SDValue Op1 = Op.getOperand(1);
15397 SDValue CC = Op.getOperand(2);
15398 MVT VT = Op.getSimpleValueType();
15399 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15400 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15401 SDLoc dl(Op);
15402
15403 if (isFP) {
15404#ifndef NDEBUG
15405 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15406 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15407#endif
15408
15409 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15410 unsigned Opc = X86ISD::CMPP;
15411 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15412 assert(VT.getVectorNumElements() <= 16);
15413 Opc = X86ISD::CMPM;
15414 }
15415 // In the two special cases we can't handle, emit two comparisons.
15416 if (SSECC == 8) {
15417 unsigned CC0, CC1;
15418 unsigned CombineOpc;
15419 if (SetCCOpcode == ISD::SETUEQ) {
15420 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15421 } else {
15422 assert(SetCCOpcode == ISD::SETONE);
15423 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15424 }
15425
15426 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15427 DAG.getConstant(CC0, MVT::i8));
15428 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15429 DAG.getConstant(CC1, MVT::i8));
15430 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15431 }
15432 // Handle all other FP comparisons here.
15433 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15434 DAG.getConstant(SSECC, MVT::i8));
15435 }
15436
15437 // Break 256-bit integer vector compare into smaller ones.
15438 if (VT.is256BitVector() && !Subtarget->hasInt256())
15439 return Lower256IntVSETCC(Op, DAG);
15440
15441 bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15442 EVT OpVT = Op1.getValueType();
15443 if (Subtarget->hasAVX512()) {
15444 if (Op1.getValueType().is512BitVector() ||
15445 (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15446 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15447 return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15448
15449 // In the AVX-512 architecture setcc returns a mask with i1 elements,
15450 // but there is no compare instruction for i8 and i16 elements in KNL.
15451 // 512-bit operands are not an issue here; those
15452 // types are illegal.
15453 if (MaskResult &&
15454 (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15455 OpVT.getVectorElementType().getSizeInBits() >= 8))
15456 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15457 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15458 }
15459
15460 // We are handling one of the integer comparisons here. Since SSE only has
15461 // GT and EQ comparisons for integers, some comparisons may require swapping
15462 // operands and multiple operations.
15463 unsigned Opc;
15464 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15465 bool Subus = false;
15466
15467 switch (SetCCOpcode) {
15468 default: llvm_unreachable("Unexpected SETCC condition");
15469 case ISD::SETNE: Invert = true;
15470 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
15471 case ISD::SETLT: Swap = true;
15472 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
15473 case ISD::SETGE: Swap = true;
15474 case ISD::SETLE: Opc = X86ISD::PCMPGT;
15475 Invert = true; break;
15476 case ISD::SETULT: Swap = true;
15477 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15478 FlipSigns = true; break;
15479 case ISD::SETUGE: Swap = true;
15480 case ISD::SETULE: Opc = X86ISD::PCMPGT;
15481 FlipSigns = true; Invert = true; break;
15482 }
15483
15484 // Special case: Use min/max operations for SETULE/SETUGE
15485 MVT VET = VT.getVectorElementType();
15486 bool hasMinMax =
15487 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15488 || (Subtarget->hasSSE2() && (VET == MVT::i8));
15489
15490 if (hasMinMax) {
15491 switch (SetCCOpcode) {
15492 default: break;
15493 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15494 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15495 }
15496
15497 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15498 }
15499
15500 bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15501 if (!MinMax && hasSubus) {
15502 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15503 // Op0 u<= Op1:
15504 // t = psubus Op0, Op1
15505 // pcmpeq t, <0..0>
15506 switch (SetCCOpcode) {
15507 default: break;
15508 case ISD::SETULT: {
15509 // If the comparison is against a constant, we can turn this into a
15510 // setule. With psubus, setule does not require a swap. This is
15511 // beneficial because the constant in the register is no longer
15512 // clobbered as the destination, so it can be hoisted out of a loop.
15513 // Only do this pre-AVX, since with AVX vpcmp* is no longer destructive and the rewrite gains nothing.
15514 if (Subtarget->hasAVX())
15515 break;
15516 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15517 if (ULEOp1.getNode()) {
15518 Op1 = ULEOp1;
15519 Subus = true; Invert = false; Swap = false;
15520 }
15521 break;
15522 }
15523 // Psubus is better than flip-sign because it requires no inversion.
15524 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
15525 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15526 }
15527
15528 if (Subus) {
15529 Opc = X86ISD::SUBUS;
15530 FlipSigns = false;
15531 }
15532 }
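As a side note on the PSUBUS special case above (comment at lines 15502-15505): for unsigned values, Op0 u<= Op1 exactly when the saturating difference Op0 -sat Op1 is zero, which is what "psubus; pcmpeq 0" tests per element. A small scalar sketch of that identity, with hypothetical names and uint8_t standing in for the vector lanes:

  #include <cassert>
  #include <cstdint>

  // Unsigned saturating subtract, the scalar analogue of PSUBUSB.
  static uint8_t psubusScalar(uint8_t a, uint8_t b) {
    return a > b ? (uint8_t)(a - b) : 0;
  }

  int main() {
    // a <=u b  <=>  psubus(a, b) == 0.
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b)
        assert((psubusScalar((uint8_t)a, (uint8_t)b) == 0) == (a <= b));
    return 0;
  }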
15533
15534 if (Swap)
15535 std::swap(Op0, Op1);
15536
15537 // Check that the operation in question is available (most are plain SSE2,
15538 // but PCMPGTQ and PCMPEQQ have different requirements).
15539 if (VT == MVT::v2i64) {
15540 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15541 assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15542
15543 // First cast everything to the right type.
15544 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15545 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15546
15547 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15548 // bits of the inputs before performing those operations. The lower
15549 // compare is always unsigned.
15550 SDValue SB;
15551 if (FlipSigns) {
15552 SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15553 } else {
15554 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15555 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15556 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15557 Sign, Zero, Sign, Zero);
15558 }
15559 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15560 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15561
15562 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15563 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15564 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15565
15566 // Create masks for only the low parts/high parts of the 64 bit integers.
15567 static const int MaskHi[] = { 1, 1, 3, 3 };
15568 static const int MaskLo[] = { 0, 0, 2, 2 };
15569 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15570 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15571 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15572
15573 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15574 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15575
15576 if (Invert)
15577 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15578
15579 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15580 }
15581
15582 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15583 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15584 // pcmpeqd + pshufd + pand.
15585 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15586
15587 // First cast everything to the right type.
15588 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15589 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15590
15591 // Do the compare.
15592 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15593
15594 // Make sure the lower and upper halves are both all-ones.
15595 static const int Mask[] = { 1, 0, 3, 2 };
15596 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15597 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15598
15599 if (Invert)
15600 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15601
15602 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15603 }
15604 }
15605
15606 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15607 // bits of the inputs before performing those operations.
15608 if (FlipSigns) {
15609 EVT EltVT = VT.getVectorElementType();
15610 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15611 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15612 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15613 }
15614
15615 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15616
15617 // If the logical-not of the result is required, perform that now.
15618 if (Invert)
15619 Result = DAG.getNOT(dl, Result, VT);
15620
15621 if (MinMax)
15622 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15623
15624 if (Subus)
15625 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15626 getZeroVector(VT, Subtarget, DAG, dl));
15627
15628 return Result;
15629}
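The SSE2-only v2i64 path above (comment at line 15562) rests on a halves identity: a signed 64-bit greater-than can be decided by comparing the high 32-bit halves signed and, on a tie, the low halves unsigned; flipping the sign bit of the low halves is what lets the signed PCMPGTD act as an unsigned compare there. A scalar sketch of that identity checked on a few boundary values; names are hypothetical and an arithmetic right shift on negative values is assumed, as x86 compilers provide:

  #include <cassert>
  #include <cstdint>

  // (a > b) for int64_t, decided from 32-bit halves the way the lowering does:
  // high halves compared signed, low halves compared unsigned.
  static bool sgt64ViaHalves(int64_t a, int64_t b) {
    int32_t ahi = (int32_t)(a >> 32), bhi = (int32_t)(b >> 32);
    uint32_t alo = (uint32_t)a, blo = (uint32_t)b;
    return (ahi > bhi) || (ahi == bhi && alo > blo);
  }

  int main() {
    const int64_t vals[] = {INT64_MIN, -1, 0, 1, 0x100000000LL, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals)
        assert(sgt64ViaHalves(a, b) == (a > b));
    return 0;
  }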
15630
15631SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15632
15633 MVT VT = Op.getSimpleValueType();
15634
15635 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15636
15637 assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15638 && "SetCC type must be 8-bit or 1-bit integer");
15639 SDValue Op0 = Op.getOperand(0);
15640 SDValue Op1 = Op.getOperand(1);
15641 SDLoc dl(Op);
15642 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15643
15644 // Optimize to BT if possible.
15645 // Lower (X & (1 << N)) == 0 to BT(X, N).
15646 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15647 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15648 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15649 Op1.getOpcode() == ISD::Constant &&
15650 cast<ConstantSDNode>(Op1)->isNullValue() &&
15651 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15652 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15653 if (NewSetCC.getNode()) {
15654 if (VT == MVT::i1)
15655 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15656 return NewSetCC;
15657 }
15658 }
15659
15660 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
15661 // these.
15662 if (Op1.getOpcode() == ISD::Constant &&
15663 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15664 cast<ConstantSDNode>(Op1)->isNullValue()) &&
15665 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15666
15667 // If the input is a setcc, then reuse the input setcc or use a new one with
15668 // the inverted condition.
15669 if (Op0.getOpcode() == X86ISD::SETCC) {
15670 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15671 bool Invert = (CC == ISD::SETNE) ^
15672 cast<ConstantSDNode>(Op1)->isNullValue();
15673 if (!Invert)
15674 return Op0;
15675
15676 CCode = X86::GetOppositeBranchCondition(CCode);
15677 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15678 DAG.getConstant(CCode, MVT::i8),
15679 Op0.getOperand(1));
15680 if (VT == MVT::i1)
15681 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15682 return SetCC;
15683 }
15684 }
15685 if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15686 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15687 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15688
15689 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15690 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15691 }
15692
15693 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15694 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15695 if (X86CC == X86::COND_INVALID)
15696 return SDValue();
15697
15698 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15699 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15700 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15701 DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15702 if (VT == MVT::i1)
15703 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15704 return SetCC;
15705}
15706
15707// isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
15708static bool isX86LogicalCmp(SDValue Op) {
15709 unsigned Opc = Op.getNode()->getOpcode();
15710 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15711 Opc == X86ISD::SAHF)
15712 return true;
15713 if (Op.getResNo() == 1 &&
15714 (Opc == X86ISD::ADD ||
15715 Opc == X86ISD::SUB ||
15716 Opc == X86ISD::ADC ||
15717 Opc == X86ISD::SBB ||
15718 Opc == X86ISD::SMUL ||
15719 Opc == X86ISD::UMUL ||
15720 Opc == X86ISD::INC ||
15721 Opc == X86ISD::DEC ||
15722 Opc == X86ISD::OR ||
15723 Opc == X86ISD::XOR ||
15724 Opc == X86ISD::AND))
15725 return true;
15726
15727 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15728 return true;
15729
15730 return false;
15731}
15732
15733static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15734 if (V.getOpcode() != ISD::TRUNCATE)
15735 return false;
15736
15737 SDValue VOp0 = V.getOperand(0);
15738 unsigned InBits = VOp0.getValueSizeInBits();
15739 unsigned Bits = V.getValueSizeInBits();
15740 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15741}
15742
15743SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15744 bool addTest = true;
15745 SDValue Cond = Op.getOperand(0);
15746 SDValue Op1 = Op.getOperand(1);
15747 SDValue Op2 = Op.getOperand(2);
15748 SDLoc DL(Op);
15749 EVT VT = Op1.getValueType();
15750 SDValue CC;
15751
15752 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15753 // are available. Otherwise fp cmovs get lowered into a less efficient branch
15754 // sequence later on.
15755 if (Cond.getOpcode() == ISD::SETCC &&
15756 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15757 (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15758 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15759 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15760 int SSECC = translateX86FSETCC(
15761 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15762
15763 if (SSECC != 8) {
15764 if (Subtarget->hasAVX512()) {
15765 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15766 DAG.getConstant(SSECC, MVT::i8));
15767 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15768 }
15769 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15770 DAG.getConstant(SSECC, MVT::i8));
15771 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15772 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15773 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15774 }
15775 }
15776
15777 if (Cond.getOpcode() == ISD::SETCC) {
15778 SDValue NewCond = LowerSETCC(Cond, DAG);
15779 if (NewCond.getNode())
15780 Cond = NewCond;
15781 }
15782
15783 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15784 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15785 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15786 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15787 if (Cond.getOpcode() == X86ISD::SETCC &&
15788 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15789 isZero(Cond.getOperand(1).getOperand(1))) {
15790 SDValue Cmp = Cond.getOperand(1);
15791
15792 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15793
15794 if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15795 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15796 SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15797
15798 SDValue CmpOp0 = Cmp.getOperand(0);
15799 // Apply further optimizations for special cases
15800 // (select (x != 0), -1, 0) -> neg & sbb
15801 // (select (x == 0), 0, -1) -> neg & sbb
15802 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15803 if (YC->isNullValue() &&
15804 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15805 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15806 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15807 DAG.getConstant(0, CmpOp0.getValueType()),
15808 CmpOp0);
15809 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15810 DAG.getConstant(X86::COND_B, MVT::i8),
15811 SDValue(Neg.getNode(), 1));
15812 return Res;
15813 }
15814
15815 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15816 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15817 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15818
15819 SDValue Res = // Res = 0 or -1.
15820 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15821 DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15822
15823 if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15824 Res = DAG.getNOT(DL, Res, Res.getValueType());
15825
15826 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15827 if (!N2C || !N2C->isNullValue())
15828 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15829 return Res;
15830 }
15831 }
15832
15833 // Look past (and (setcc_carry (cmp ...)), 1).
15834 if (Cond.getOpcode() == ISD::AND &&
15835 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15836 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15837 if (C && C->getAPIntValue() == 1)
15838 Cond = Cond.getOperand(0);
15839 }
15840
15841 // If condition flag is set by a X86ISD::CMP, then use it as the condition
15842 // setting operand in place of the X86ISD::SETCC.
15843 unsigned CondOpcode = Cond.getOpcode();
15844 if (CondOpcode == X86ISD::SETCC ||
15845 CondOpcode == X86ISD::SETCC_CARRY) {
15846 CC = Cond.getOperand(0);
15847
15848 SDValue Cmp = Cond.getOperand(1);
15849 unsigned Opc = Cmp.getOpcode();
15850 MVT VT = Op.getSimpleValueType();
15851
15852 bool IllegalFPCMov = false;
15853 if (VT.isFloatingPoint() && !VT.isVector() &&
15854 !isScalarFPTypeInSSEReg(VT)) // FPStack?
15855 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15856
15857 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15858 Opc == X86ISD::BT) { // FIXME
15859 Cond = Cmp;
15860 addTest = false;
15861 }
15862 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15863 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15864 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15865 Cond.getOperand(0).getValueType() != MVT::i8)) {
15866 SDValue LHS = Cond.getOperand(0);
15867 SDValue RHS = Cond.getOperand(1);
15868 unsigned X86Opcode;
15869 unsigned X86Cond;
15870 SDVTList VTs;
15871 switch (CondOpcode) {
15872 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15873 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15874 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15875 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15876 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15877 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15878 default: llvm_unreachable("unexpected overflowing operator");
15879 }
15880 if (CondOpcode == ISD::UMULO)
15881 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15882 MVT::i32);
15883 else
15884 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15885
15886 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15887
15888 if (CondOpcode == ISD::UMULO)
15889 Cond = X86Op.getValue(2);
15890 else
15891 Cond = X86Op.getValue(1);
15892
15893 CC = DAG.getConstant(X86Cond, MVT::i8);
15894 addTest = false;
15895 }
15896
15897 if (addTest) {
15898 // Look past the truncate if the high bits are known zero.
15899 if (isTruncWithZeroHighBitsInput(Cond, DAG))
15900 Cond = Cond.getOperand(0);
15901
15902 // We know the result of AND is compared against zero. Try to match
15903 // it to BT.
15904 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15905 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15906 if (NewSetCC.getNode()) {
15907 CC = NewSetCC.getOperand(0);
15908 Cond = NewSetCC.getOperand(1);
15909 addTest = false;
15910 }
15911 }
15912 }
15913
15914 if (addTest) {
15915 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15916 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15917 }
15918
15919 // a < b ? -1 : 0 -> RES = ~setcc_carry
15920 // a < b ? 0 : -1 -> RES = setcc_carry
15921 // a >= b ? -1 : 0 -> RES = setcc_carry
15922 // a >= b ? 0 : -1 -> RES = ~setcc_carry
15923 if (Cond.getOpcode() == X86ISD::SUB) {
15924 Cond = ConvertCmpIfNecessary(Cond, DAG);
15925 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15926
15927 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15928 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15929 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15930 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15931 if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15932 return DAG.getNOT(DL, Res, Res.getValueType());
15933 return Res;
15934 }
15935 }
15936
15937 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
15938 // widen the cmov and push the truncate through. This avoids introducing a new
15939 // branch during isel and doesn't add any extensions.
15940 if (Op.getValueType() == MVT::i8 &&
15941 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15942 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15943 if (T1.getValueType() == T2.getValueType() &&
15944 // Blacklist CopyFromReg to avoid partial register stalls.
15945 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15946 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15947 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15948 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15949 }
15950 }
15951
15952 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15953 // condition is true.
15954 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15955 SDValue Ops[] = { Op2, Op1, CC, Cond };
15956 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15957}
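A scalar illustration of the "neg & sbb" special case noted at lines 15800-15801 above: NEG sets the carry flag exactly when its operand is nonzero, and SBB of a register with itself materializes -CF, so (x != 0) ? -1 : 0 (and its inverse) needs no branch. The helper name below is hypothetical and only models the flag behaviour:

  #include <cassert>
  #include <cstdint>

  static uint32_t selectNeZeroAllOnes(uint32_t x) {
    uint32_t carry = (x != 0) ? 1u : 0u;  // CF after "neg x"
    return 0u - carry;                    // "sbb r, r" yields -CF: all-ones or 0
  }

  int main() {
    assert(selectNeZeroAllOnes(0) == 0u);
    assert(selectNeZeroAllOnes(123) == 0xFFFFFFFFu);
    return 0;
  }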
15958
15959static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15960 SelectionDAG &DAG) {
15961 MVT VT = Op->getSimpleValueType(0);
15962 SDValue In = Op->getOperand(0);
15963 MVT InVT = In.getSimpleValueType();
15964 MVT VTElt = VT.getVectorElementType();
15965 MVT InVTElt = InVT.getVectorElementType();
15966 SDLoc dl(Op);
15967
15968 // SKX processor
15969 if ((InVTElt == MVT::i1) &&
15970 (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15971 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15972
15973 ((Subtarget->hasBWI() && VT.is512BitVector() &&
15974 VTElt.getSizeInBits() <= 16)) ||
15975
15976 ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15977 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15978
15979 ((Subtarget->hasDQI() && VT.is512BitVector() &&
15980 VTElt.getSizeInBits() >= 32))))
15981 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15982
15983 unsigned int NumElts = VT.getVectorNumElements();
15984
15985 if (NumElts != 8 && NumElts != 16)
15986 return SDValue();
15987
15988 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
15989 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
15990 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
15991 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15992 }
15993
15994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15995 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
15996
15997 MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
15998 Constant *C = ConstantInt::get(*DAG.getContext(),
15999 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16000
16001 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16002 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16003 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16004 MachinePointerInfo::getConstantPool(),
16005 false, false, false, Alignment);
16006 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16007 if (VT.is512BitVector())
16008 return Brcst;
16009 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16010}
16011
16012static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16013 SelectionDAG &DAG) {
16014 MVT VT = Op->getSimpleValueType(0);
16015 SDValue In = Op->getOperand(0);
16016 MVT InVT = In.getSimpleValueType();
16017 SDLoc dl(Op);
16018
16019 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16020 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16021
16022 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16023 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16024 (VT != MVT::v16i16 || InVT != MVT::v16i8))
16025 return SDValue();
16026
16027 if (Subtarget->hasInt256())
16028 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16029
16030 // Optimize vectors in AVX mode:
16031 // sign extend v8i16 to v8i32 and
16032 // v4i32 to v4i64.
16033 //
16034 // Divide the input vector into two parts;
16035 // for v4i32 the shuffle masks will be { 0, 1, -1, -1} and {2, 3, -1, -1}.
16036 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16037 // then concatenate the vectors back to the original VT.
16038
16039 unsigned NumElems = InVT.getVectorNumElements();
16040 SDValue Undef = DAG.getUNDEF(InVT);
16041
16042 SmallVector<int,8> ShufMask1(NumElems, -1);
16043 for (unsigned i = 0; i != NumElems/2; ++i)
16044 ShufMask1[i] = i;
16045
16046 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16047
16048 SmallVector<int,8> ShufMask2(NumElems, -1);
16049 for (unsigned i = 0; i != NumElems/2; ++i)
16050 ShufMask2[i] = i + NumElems/2;
16051
16052 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16053
16054 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16055 VT.getVectorNumElements()/2);
16056
16057 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16058 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16059
16060 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16061}
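A sketch of the AVX1 path above with plain arrays standing in for the vectors: each half is extracted with a shuffle and sign-extended on its own, then the two results are concatenated back to the full width. Names are hypothetical:

  #include <array>
  #include <cassert>
  #include <cstdint>

  static std::array<int32_t, 8> sextV8I16ToV8I32(const std::array<int16_t, 8> &in) {
    std::array<int32_t, 8> out{};
    for (int i = 0; i < 4; ++i) out[i] = in[i];          // extend the low half
    for (int i = 0; i < 4; ++i) out[4 + i] = in[4 + i];  // extend the high half
    return out;                                          // concat of both halves
  }

  int main() {
    std::array<int16_t, 8> in{-1, 2, -3, 4, -5, 6, -7, INT16_MIN};
    auto out = sextV8I16ToV8I32(in);
    assert(out[0] == -1 && out[6] == -7 && out[7] == INT16_MIN);
    return 0;
  }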
16062
16063// Lower vector extended loads using a shuffle. If SSSE3 is not available we
16064// may emit an illegal shuffle but the expansion is still better than scalar
16065// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16066// we'll emit a shuffle and an arithmetic shift.
16067// TODO: It is possible to support ZExt by zeroing the undef values during
16068// the shuffle phase or after the shuffle.
16069static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16070 SelectionDAG &DAG) {
16071 MVT RegVT = Op.getSimpleValueType();
16072 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16073 assert(RegVT.isInteger() &&
16074 "We only custom lower integer vector sext loads.");
16075
16076 // Nothing useful we can do without SSE2 shuffles.
16077 assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16078
16079 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16080 SDLoc dl(Ld);
16081 EVT MemVT = Ld->getMemoryVT();
16082 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16083 unsigned RegSz = RegVT.getSizeInBits();
16084
16085 ISD::LoadExtType Ext = Ld->getExtensionType();
16086
16087 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16088 && "Only anyext and sext are currently implemented.");
16089 assert(MemVT != RegVT && "Cannot extend to the same type");
16090 assert(MemVT.isVector() && "Must load a vector from memory");
16091
16092 unsigned NumElems = RegVT.getVectorNumElements();
16093 unsigned MemSz = MemVT.getSizeInBits();
16094 assert(RegSz > MemSz && "Register size must be greater than the mem size");
16095
16096 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16097 // The only way in which we have a legal 256-bit vector result but not the
16098 // integer 256-bit operations needed to directly lower a sextload is if we
16099 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16100 // a 128-bit vector and a normal sign_extend to 256-bits that should get
16101 // correctly legalized. We do this late to allow the canonical form of
16102 // sextload to persist throughout the rest of the DAG combiner -- it wants
16103 // to fold together any extensions it can, and so will fuse a sign_extend
16104 // of an sextload into a sextload targeting a wider value.
16105 SDValue Load;
16106 if (MemSz == 128) {
16107 // Just switch this to a normal load.
16108 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16109 "it must be a legal 128-bit vector "
16110 "type!");
16111 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16112 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16113 Ld->isInvariant(), Ld->getAlignment());
16114 } else {
16115 assert(MemSz < 128 &&
16116 "Can't extend a type wider than 128 bits to a 256 bit vector!");
16117 // Do an sext load to a 128-bit vector type. We want to use the same
16118 // number of elements, but elements half as wide. This will end up being
16119 // recursively lowered by this routine, but will succeed as we definitely
16120 // have all the necessary features if we're using AVX1.
16121 EVT HalfEltVT =
16122 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16123 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16124 Load =
16125 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16126 Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16127 Ld->isNonTemporal(), Ld->isInvariant(),
16128 Ld->getAlignment());
16129 }
16130
16131 // Replace chain users with the new chain.
16132 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16133 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16134
16135 // Finally, do a normal sign-extend to the desired register.
16136 return DAG.getSExtOrTrunc(Load, dl, RegVT);
16137 }
16138
16139 // All sizes must be a power of two.
16140 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16141 "Non-power-of-two elements are not custom lowered!");
16142
16143 // Attempt to load the original value using scalar loads.
16144 // Find the largest scalar type that divides the total loaded size.
16145 MVT SclrLoadTy = MVT::i8;
16146 for (MVT Tp : MVT::integer_valuetypes()) {
16147 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16148 SclrLoadTy = Tp;
16149 }
16150 }
16151
16152 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16153 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16154 (64 <= MemSz))
16155 SclrLoadTy = MVT::f64;
16156
16157 // Calculate the number of scalar loads that we need to perform
16158 // in order to load our vector from memory.
16159 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16160
16161 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16162 "Can only lower sext loads with a single scalar load!");
16163
16164 unsigned loadRegZize = RegSz;
16165 if (Ext == ISD::SEXTLOAD && RegSz == 256)
16166 loadRegZize /= 2;
16167
16168 // Represent our vector as a sequence of elements which are the
16169 // largest scalar that we can load.
16170 EVT LoadUnitVecVT = EVT::getVectorVT(
16171 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16172
16173 // Represent the data using the same element type that is stored in
16174 // memory. In practice, we "widen" MemVT.
16175 EVT WideVecVT =
16176 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16177 loadRegZize / MemVT.getScalarType().getSizeInBits());
16178
16179 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16180 "Invalid vector type");
16181
16182 // We can't shuffle using an illegal type.
16183 assert(TLI.isTypeLegal(WideVecVT) &&
16184 "We only lower types that form legal widened vector types");
16185
16186 SmallVector<SDValue, 8> Chains;
16187 SDValue Ptr = Ld->getBasePtr();
16188 SDValue Increment =
16189 DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16190 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16191
16192 for (unsigned i = 0; i < NumLoads; ++i) {
16193 // Perform a single load.
16194 SDValue ScalarLoad =
16195 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16196 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16197 Ld->getAlignment());
16198 Chains.push_back(ScalarLoad.getValue(1));
16199 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16200 // another round of DAGCombining.
16201 if (i == 0)
16202 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16203 else
16204 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16205 ScalarLoad, DAG.getIntPtrConstant(i));
16206
16207 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16208 }
16209
16210 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16211
16212 // Bitcast the loaded value to a vector of the original element type, in
16213 // the size of the target vector type.
16214 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16215 unsigned SizeRatio = RegSz / MemSz;
16216
16217 if (Ext == ISD::SEXTLOAD) {
16218 // If we have SSE4.1, we can directly emit a VSEXT node.
16219 if (Subtarget->hasSSE41()) {
16220 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16221 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16222 return Sext;
16223 }
16224
16225 // Otherwise we'll shuffle the small elements into the high bits of the
16226 // larger type and perform an arithmetic shift. If the shift is not legal
16227 // it's better to scalarize.
16228 assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16229 "We can't implement a sext load without an arithmetic right shift!");
16230
16231 // Redistribute the loaded elements into the different locations.
16232 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16233 for (unsigned i = 0; i != NumElems; ++i)
16234 ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16235
16236 SDValue Shuff = DAG.getVectorShuffle(
16237 WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16238
16239 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16240
16241 // Build the arithmetic shift.
16242 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16243 MemVT.getVectorElementType().getSizeInBits();
16244 Shuff =
16245 DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16246
16247 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16248 return Shuff;
16249 }
16250
16251 // Redistribute the loaded elements into the different locations.
16252 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16253 for (unsigned i = 0; i != NumElems; ++i)
16254 ShuffleVec[i * SizeRatio] = i;
16255
16256 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16257 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16258
16259 // Bitcast to the requested type.
16260 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16261 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16262 return Shuff;
16263}
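The shuffle-plus-arithmetic-shift fallback above has a simple scalar counterpart: placing the narrow element in the top bits of the wide lane (what the mask index i * SizeRatio + SizeRatio - 1 at line 16234 arranges per lane) and shifting right arithmetically reproduces the sign extension. A minimal sketch, assuming the two's-complement arithmetic shift that x86 compilers provide; names are hypothetical:

  #include <cassert>
  #include <cstdint>

  static int32_t sext8ViaShift(uint8_t v) {
    int32_t widened = (int32_t)((uint32_t)v << 24);  // narrow bits in the top byte
    return widened >> 24;                            // arithmetic shift restores sign
  }

  int main() {
    assert(sext8ViaShift(0x7F) == 127);
    assert(sext8ViaShift(0x80) == -128);
    assert(sext8ViaShift(0xFF) == -1);
    return 0;
  }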
16264
16265// isAndOrOfSetCCs - Return true if the node is an ISD::AND or
16266// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16267// from the AND / OR.
16268static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16269 Opc = Op.getOpcode();
16270 if (Opc != ISD::OR && Opc != ISD::AND)
16271 return false;
16272 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16273 Op.getOperand(0).hasOneUse() &&
16274 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16275 Op.getOperand(1).hasOneUse());
16276}
16277
16278// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
16279// and 1, and the SETCC node has a single use.
16280static bool isXor1OfSetCC(SDValue Op) {
16281 if (Op.getOpcode() != ISD::XOR)
16282 return false;
16283 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16284 if (N1C && N1C->getAPIntValue() == 1) {
16285 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16286 Op.getOperand(0).hasOneUse();
16287 }
16288 return false;
16289}
16290
16291SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16292 bool addTest = true;
16293 SDValue Chain = Op.getOperand(0);
16294 SDValue Cond = Op.getOperand(1);
16295 SDValue Dest = Op.getOperand(2);
16296 SDLoc dl(Op);
16297 SDValue CC;
16298 bool Inverted = false;
16299
16300 if (Cond.getOpcode() == ISD::SETCC) {
16301 // Check for setcc([su]{add,sub,mul}o == 0).
16302 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16303 isa<ConstantSDNode>(Cond.getOperand(1)) &&
16304 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16305 Cond.getOperand(0).getResNo() == 1 &&
16306 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16307 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16308 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16309 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16310 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16311 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16312 Inverted = true;
16313 Cond = Cond.getOperand(0);
16314 } else {
16315 SDValue NewCond = LowerSETCC(Cond, DAG);
16316 if (NewCond.getNode())
16317 Cond = NewCond;
16318 }
16319 }
16320#if 0
16321 // FIXME: LowerXALUO doesn't handle these!!
16322 else if (Cond.getOpcode() == X86ISD::ADD ||
16323 Cond.getOpcode() == X86ISD::SUB ||
16324 Cond.getOpcode() == X86ISD::SMUL ||
16325 Cond.getOpcode() == X86ISD::UMUL)
16326 Cond = LowerXALUO(Cond, DAG);
16327#endif
16328
16329 // Look past (and (setcc_carry (cmp ...)), 1).
16330 if (Cond.getOpcode() == ISD::AND &&
16331 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16332 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16333 if (C && C->getAPIntValue() == 1)
16334 Cond = Cond.getOperand(0);
16335 }
16336
16337 // If condition flag is set by a X86ISD::CMP, then use it as the condition
16338 // setting operand in place of the X86ISD::SETCC.
16339 unsigned CondOpcode = Cond.getOpcode();
16340 if (CondOpcode == X86ISD::SETCC ||
16341 CondOpcode == X86ISD::SETCC_CARRY) {
16342 CC = Cond.getOperand(0);
16343
16344 SDValue Cmp = Cond.getOperand(1);
16345 unsigned Opc = Cmp.getOpcode();
16346 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16347 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16348 Cond = Cmp;
16349 addTest = false;
16350 } else {
16351 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16352 default: break;
16353 case X86::COND_O:
16354 case X86::COND_B:
16355 // These can only come from an arithmetic instruction with overflow,
16356 // e.g. SADDO, UADDO.
16357 Cond = Cond.getNode()->getOperand(1);
16358 addTest = false;
16359 break;
16360 }
16361 }
16362 }
16363 CondOpcode = Cond.getOpcode();
16364 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16365 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16366 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16367 Cond.getOperand(0).getValueType() != MVT::i8)) {
16368 SDValue LHS = Cond.getOperand(0);
16369 SDValue RHS = Cond.getOperand(1);
16370 unsigned X86Opcode;
16371 unsigned X86Cond;
16372 SDVTList VTs;
16373 // Keep this in sync with LowerXALUO, otherwise we might create redundant
16374 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16375 // X86ISD::INC).
16376 switch (CondOpcode) {
16377 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16378 case ISD::SADDO:
16379 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16380 if (C->isOne()) {
16381 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16382 break;
16383 }
16384 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16385 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16386 case ISD::SSUBO:
16387 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16388 if (C->isOne()) {
16389 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16390 break;
16391 }
16392 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16393 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16394 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16395 default: llvm_unreachable("unexpected overflowing operator");
16396 }
16397 if (Inverted)
16398 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16399 if (CondOpcode == ISD::UMULO)
16400 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16401 MVT::i32);
16402 else
16403 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16404
16405 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16406
16407 if (CondOpcode == ISD::UMULO)
16408 Cond = X86Op.getValue(2);
16409 else
16410 Cond = X86Op.getValue(1);
16411
16412 CC = DAG.getConstant(X86Cond, MVT::i8);
16413 addTest = false;
16414 } else {
16415 unsigned CondOpc;
16416 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16417 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16418 if (CondOpc == ISD::OR) {
16419 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16420 // two branches instead of an explicit OR instruction with a
16421 // separate test.
16422 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16423 isX86LogicalCmp(Cmp)) {
16424 CC = Cond.getOperand(0).getOperand(0);
16425 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16426 Chain, Dest, CC, Cmp);
16427 CC = Cond.getOperand(1).getOperand(0);
16428 Cond = Cmp;
16429 addTest = false;
16430 }
16431 } else { // ISD::AND
16432 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16433 // two branches instead of an explicit AND instruction with a
16434 // separate test. However, we only do this if this block doesn't
16435 // have a fall-through edge, because this requires an explicit
16436 // jmp when the condition is false.
16437 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16438 isX86LogicalCmp(Cmp) &&
16439 Op.getNode()->hasOneUse()) {
16440 X86::CondCode CCode =
16441 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16442 CCode = X86::GetOppositeBranchCondition(CCode);
16443 CC = DAG.getConstant(CCode, MVT::i8);
16444 SDNode *User = *Op.getNode()->use_begin();
16445 // Look for an unconditional branch following this conditional branch.
16446 // We need this because we need to reverse the successors in order
16447 // to implement FCMP_OEQ.
16448 if (User->getOpcode() == ISD::BR) {
16449 SDValue FalseBB = User->getOperand(1);
16450 SDNode *NewBR =
16451 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16452 assert(NewBR == User);
16453 (void)NewBR;
16454 Dest = FalseBB;
16455
16456 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16457 Chain, Dest, CC, Cmp);
16458 X86::CondCode CCode =
16459 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16460 CCode = X86::GetOppositeBranchCondition(CCode);
16461 CC = DAG.getConstant(CCode, MVT::i8);
16462 Cond = Cmp;
16463 addTest = false;
16464 }
16465 }
16466 }
16467 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16468 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
16469 // It should be transformed by the dag combiner except when the condition
16470 // is set by an arithmetic-with-overflow node.
16471 X86::CondCode CCode =
16472 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16473 CCode = X86::GetOppositeBranchCondition(CCode);
16474 CC = DAG.getConstant(CCode, MVT::i8);
16475 Cond = Cond.getOperand(0).getOperand(1);
16476 addTest = false;
16477 } else if (Cond.getOpcode() == ISD::SETCC &&
16478 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16479 // For FCMP_OEQ, we can emit
16480 // two branches instead of an explicit AND instruction with a
16481 // separate test. However, we only do this if this block doesn't
16482 // have a fall-through edge, because this requires an explicit
16483 // jmp when the condition is false.
16484 if (Op.getNode()->hasOneUse()) {
16485 SDNode *User = *Op.getNode()->use_begin();
16486 // Look for an unconditional branch following this conditional branch.
16487 // We need this because we need to reverse the successors in order
16488 // to implement FCMP_OEQ.
16489 if (User->getOpcode() == ISD::BR) {
16490 SDValue FalseBB = User->getOperand(1);
16491 SDNode *NewBR =
16492 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16493 assert(NewBR == User);
16494 (void)NewBR;
16495 Dest = FalseBB;
16496
16497 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16498 Cond.getOperand(0), Cond.getOperand(1));
16499 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16500 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16501 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16502 Chain, Dest, CC, Cmp);
16503 CC = DAG.getConstant(X86::COND_P, MVT::i8);
16504 Cond = Cmp;
16505 addTest = false;
16506 }
16507 }
16508 } else if (Cond.getOpcode() == ISD::SETCC &&
16509 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16510 // For FCMP_UNE, we can emit
16511 // two branches instead of an explicit AND instruction with a
16512 // separate test. However, we only do this if this block doesn't
16513 // have a fall-through edge, because this requires an explicit
16514 // jmp when the condition is false.
16515 if (Op.getNode()->hasOneUse()) {
16516 SDNode *User = *Op.getNode()->use_begin();
16517 // Look for an unconditional branch following this conditional branch.
16518 // We need this because we need to reverse the successors in order
16519 // to implement FCMP_UNE.
16520 if (User->getOpcode() == ISD::BR) {
16521 SDValue FalseBB = User->getOperand(1);
16522 SDNode *NewBR =
16523 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16524 assert(NewBR == User);
16525 (void)NewBR;
16526
16527 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16528 Cond.getOperand(0), Cond.getOperand(1));
16529 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16530 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16531 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16532 Chain, Dest, CC, Cmp);
16533 CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16534 Cond = Cmp;
16535 addTest = false;
16536 Dest = FalseBB;
16537 }
16538 }
16539 }
16540 }
16541
16542 if (addTest) {
16543 // Look past the truncate if the high bits are known zero.
16544 if (isTruncWithZeroHighBitsInput(Cond, DAG))
16545 Cond = Cond.getOperand(0);
16546
16547 // We know the result of AND is compared against zero. Try to match
16548 // it to BT.
16549 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16550 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16551 if (NewSetCC.getNode()) {
16552 CC = NewSetCC.getOperand(0);
16553 Cond = NewSetCC.getOperand(1);
16554 addTest = false;
16555 }
16556 }
16557 }
16558
16559 if (addTest) {
16560 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16561 CC = DAG.getConstant(X86Cond, MVT::i8);
16562 Cond = EmitTest(Cond, X86Cond, dl, DAG);
16563 }
16564 Cond = ConvertCmpIfNecessary(Cond, DAG);
16565 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16566 Chain, Dest, CC, Cond);
16567}
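
Aside: a minimal sketch of the condition-code pairing used by the SETOEQ and SETUNE paths above (the enum and helper below are hypothetical, not LLVM API). FCMP_OEQ is true only when ZF is set and PF is clear after the floating-point compare, so the lowering jumps to the false block on COND_NE and again on COND_P; FCMP_UNE mirrors this, branching to the true block on COND_NE and to the false block on COND_NP.

    // Illustrative only; assumes the unconditional successor has already been
    // retargeted via UpdateNodeOperands as in the code above.
    enum class BranchCond { NE, P, NP };
    struct TwoBranchLowering {
      BranchCond First;  // first conditional jump emitted
      BranchCond Second; // second conditional jump emitted
    };
    inline TwoBranchLowering lowerFpEqualityBranch(bool IsOrderedEq) {
      return IsOrderedEq ? TwoBranchLowering{BranchCond::NE, BranchCond::P}   // FCMP_OEQ
                         : TwoBranchLowering{BranchCond::NE, BranchCond::NP}; // FCMP_UNE
    }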
16568
16569// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16570// Calls to _alloca are needed to probe the stack when allocating more than 4k
16571// bytes in one go. Touching the stack at 4K increments is necessary to ensure
16572// that the guard pages used by the OS virtual memory manager are allocated in
16573// correct sequence.
16574SDValue
16575X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16576 SelectionDAG &DAG) const {
16577 MachineFunction &MF = DAG.getMachineFunction();
16578 bool SplitStack = MF.shouldSplitStack();
16579 bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16580 SplitStack;
16581 SDLoc dl(Op);
16582
16583 if (!Lower) {
16584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16585 SDNode* Node = Op.getNode();
16586
16587 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16588 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16589        " not tell us which reg is the stack pointer!");
16590 EVT VT = Node->getValueType(0);
16591 SDValue Tmp1 = SDValue(Node, 0);
16592 SDValue Tmp2 = SDValue(Node, 1);
16593 SDValue Tmp3 = Node->getOperand(2);
16594 SDValue Chain = Tmp1.getOperand(0);
16595
16596 // Chain the dynamic stack allocation so that it doesn't modify the stack
16597 // pointer when other instructions are using the stack.
16598 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16599 SDLoc(Node));
16600
16601 SDValue Size = Tmp2.getOperand(1);
16602 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16603 Chain = SP.getValue(1);
16604 unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16605 const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16606 unsigned StackAlign = TFI.getStackAlignment();
16607 Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16608 if (Align > StackAlign)
16609 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16610 DAG.getConstant(-(uint64_t)Align, VT));
16611 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16612
16613 Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16614 DAG.getIntPtrConstant(0, true), SDValue(),
16615 SDLoc(Node));
16616
16617 SDValue Ops[2] = { Tmp1, Tmp2 };
16618 return DAG.getMergeValues(Ops, dl);
16619 }
16620
16621 // Get the inputs.
16622 SDValue Chain = Op.getOperand(0);
16623 SDValue Size = Op.getOperand(1);
16624 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16625 EVT VT = Op.getNode()->getValueType(0);
16626
16627 bool Is64Bit = Subtarget->is64Bit();
16628 EVT SPTy = getPointerTy();
16629
16630 if (SplitStack) {
16631 MachineRegisterInfo &MRI = MF.getRegInfo();
16632
16633 if (Is64Bit) {
16634 // The 64-bit implementation of segmented stacks needs to clobber both r10
16635 // and r11, which makes it impossible to use along with nested parameters.
16636 const Function *F = MF.getFunction();
16637
16638 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16639 I != E; ++I)
16640 if (I->hasNestAttr())
16641 report_fatal_error("Cannot use segmented stacks with functions that "
16642 "have nested arguments.");
16643 }
16644
16645 const TargetRegisterClass *AddrRegClass =
16646 getRegClassFor(getPointerTy());
16647 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16648 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16649 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16650 DAG.getRegister(Vreg, SPTy));
16651 SDValue Ops1[2] = { Value, Chain };
16652 return DAG.getMergeValues(Ops1, dl);
16653 } else {
16654 SDValue Flag;
16655 const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16656
16657 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16658 Flag = Chain.getValue(1);
16659 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16660
16661 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16662
16663 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16664 DAG.getSubtarget().getRegisterInfo());
16665 unsigned SPReg = RegInfo->getStackRegister();
16666 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16667 Chain = SP.getValue(1);
16668
16669 if (Align) {
16670 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16671 DAG.getConstant(-(uint64_t)Align, VT));
16672 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16673 }
16674
16675 SDValue Ops1[2] = { SP, Chain };
16676 return DAG.getMergeValues(Ops1, dl);
16677 }
16678}
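
Aside: the function above picks one of three strategies; a minimal sketch with hypothetical names that mirrors its Lower/SplitStack checks (on Windows the WIN_ALLOCA node is later expanded into the probing call described in the comment before the function).

    // Illustrative summary of the dispatch above, not an LLVM API.
    enum class DynAllocaStrategy { GenericSPAdjust, SegmentedStack, WindowsProbe };
    inline DynAllocaStrategy pickDynAllocaStrategy(bool IsOSWindows, bool IsMachO,
                                                   bool SplitStack) {
      bool Lower = (IsOSWindows && !IsMachO) || SplitStack;
      if (!Lower)
        return DynAllocaStrategy::GenericSPAdjust; // CALLSEQ + SUB/AND on the stack pointer
      if (SplitStack)
        return DynAllocaStrategy::SegmentedStack;  // X86ISD::SEG_ALLOCA
      return DynAllocaStrategy::WindowsProbe;      // X86ISD::WIN_ALLOCA
    }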
16679
16680SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16681 MachineFunction &MF = DAG.getMachineFunction();
16682 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16683
16684 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16685 SDLoc DL(Op);
16686
16687 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16688 // vastart just stores the address of the VarArgsFrameIndex slot into the
16689 // memory location argument.
16690 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16691 getPointerTy());
16692 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16693 MachinePointerInfo(SV), false, false, 0);
16694 }
16695
16696 // __va_list_tag:
16697 // gp_offset (0 - 6 * 8)
16698 // fp_offset (48 - 48 + 8 * 16)
16699 // overflow_arg_area (point to parameters coming in memory).
16700 // reg_save_area
16701 SmallVector<SDValue, 8> MemOps;
16702 SDValue FIN = Op.getOperand(1);
16703 // Store gp_offset
16704 SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16705 DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16706 MVT::i32),
16707 FIN, MachinePointerInfo(SV), false, false, 0);
16708 MemOps.push_back(Store);
16709
16710 // Store fp_offset
16711 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16712 FIN, DAG.getIntPtrConstant(4));
16713 Store = DAG.getStore(Op.getOperand(0), DL,
16714 DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16715 MVT::i32),
16716 FIN, MachinePointerInfo(SV, 4), false, false, 0);
16717 MemOps.push_back(Store);
16718
16719 // Store ptr to overflow_arg_area
16720 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16721 FIN, DAG.getIntPtrConstant(4));
16722 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16723 getPointerTy());
16724 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16725 MachinePointerInfo(SV, 8),
16726 false, false, 0);
16727 MemOps.push_back(Store);
16728
16729 // Store ptr to reg_save_area.
16730 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16731 FIN, DAG.getIntPtrConstant(8));
16732 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16733 getPointerTy());
16734 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16735 MachinePointerInfo(SV, 16), false, false, 0);
16736 MemOps.push_back(Store);
16737 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16738}
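
Aside: the four stores above fill in the SysV x86-64 __va_list_tag record; a sketch of the layout being assumed, with offsets matching the +4, +4 and +8 pointer increments in the code (the struct and field names are illustrative, not types used by this file).

    struct VaListTag64 {
      unsigned GpOffset;        // offset 0:  next unused GPR slot, 0..48
      unsigned FpOffset;        // offset 4:  next unused XMM slot, 48..176
      void    *OverflowArgArea; // offset 8:  arguments that were passed on the stack
      void    *RegSaveArea;     // offset 16: the register save block
    };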
16739
16740SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16741 assert(Subtarget->is64Bit() &&
16742        "LowerVAARG only handles 64-bit va_arg!");
16743 assert((Subtarget->isTargetLinux() ||
16744         Subtarget->isTargetDarwin()) &&
16745        "Unhandled target in LowerVAARG");
16746 assert(Op.getNode()->getNumOperands() == 4);
16747 SDValue Chain = Op.getOperand(0);
16748 SDValue SrcPtr = Op.getOperand(1);
16749 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16750 unsigned Align = Op.getConstantOperandVal(3);
16751 SDLoc dl(Op);
16752
16753 EVT ArgVT = Op.getNode()->getValueType(0);
16754 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16755 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16756 uint8_t ArgMode;
16757
16758 // Decide which area this value should be read from.
16759 // TODO: Implement the AMD64 ABI in its entirety. This simple
16760 // selection mechanism works only for the basic types.
16761 if (ArgVT == MVT::f80) {
16762 llvm_unreachable("va_arg for f80 not yet implemented");
16763 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16764 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
16765 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16766 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
16767 } else {
16768 llvm_unreachable("Unhandled argument type in LowerVAARG");
16769 }
16770
16771 if (ArgMode == 2) {
16772 // Sanity Check: Make sure using fp_offset makes sense.
16773 assert(!DAG.getTarget().Options.UseSoftFloat &&
16774        !(DAG.getMachineFunction()
16775          .getFunction()->getAttributes()
16776          .hasAttribute(AttributeSet::FunctionIndex,
16777                        Attribute::NoImplicitFloat)) &&
16778        Subtarget->hasSSE1());
16779 }
16780
16781 // Insert VAARG_64 node into the DAG
16782 // VAARG_64 returns two values: Variable Argument Address, Chain
16783 SmallVector<SDValue, 11> InstOps;
16784 InstOps.push_back(Chain);
16785 InstOps.push_back(SrcPtr);
16786 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16787 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16788 InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16789 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
16790 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16791 VTs, InstOps, MVT::i64,
16792 MachinePointerInfo(SV),
16793 /*Align=*/0,
16794 /*Volatile=*/false,
16795 /*ReadMem=*/true,
16796 /*WriteMem=*/true);
16797 Chain = VAARG.getValue(1);
16798
16799 // Load the next argument and return it
16800 return DAG.getLoad(ArgVT, dl,
16801 Chain,
16802 VAARG,
16803 MachinePointerInfo(),
16804 false, false, false, 0);
16805}
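
Aside: a worked example of the ArgMode selection above (illustrative); f80 and any type matching neither check hit the llvm_unreachable paths.

    //   va_arg(ap, int)          ArgVT = i32, ArgSize = 4  -> ArgMode = 1 (gp_offset)
    //   va_arg(ap, long long)    ArgVT = i64, ArgSize = 8  -> ArgMode = 1 (gp_offset)
    //   va_arg(ap, double)       ArgVT = f64, ArgSize = 8  -> ArgMode = 2 (fp_offset)
    //   va_arg(ap, long double)  ArgVT = f80               -> not yet implemented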
16806
16807static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16808 SelectionDAG &DAG) {
16809 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16810 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16811 SDValue Chain = Op.getOperand(0);
16812 SDValue DstPtr = Op.getOperand(1);
16813 SDValue SrcPtr = Op.getOperand(2);
16814 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16815 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16816 SDLoc DL(Op);
16817
16818 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16819 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16820 false,
16821 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16822}
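
Aside: the constant 24 in the memcpy above is sizeof(__va_list_tag) on x86-64, i.e. 4 + 4 + 8 + 8 bytes; assuming the VaListTag64 sketch shown after LowerVASTART, this could be documented as:

    static_assert(sizeof(VaListTag64) == 24,
                  "x86-64 va_list is 4 + 4 + 8 + 8 = 24 bytes");
    // The alignment argument of 8 likewise matches the alignment of the pointer fields.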
16823
16824// getTargetVShiftByConstNode - Handle vector element shifts where the shift
16825// amount is a constant. Takes immediate version of shift as input.
16826static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16827 SDValue SrcOp, uint64_t ShiftAmt,
16828 SelectionDAG &DAG) {
16829 MVT ElementType = VT.getVectorElementType();
16830
16831 // Fold this packed shift into its first operand if ShiftAmt is 0.
16832 if (ShiftAmt == 0)
16833 return SrcOp;
16834
16835 // Check for ShiftAmt >= element width
16836 if (ShiftAmt >= ElementType.getSizeInBits()) {
16837 if (Opc == X86ISD::VSRAI)
16838 ShiftAmt = ElementType.getSizeInBits() - 1;
16839 else
16840 return DAG.getConstant(0, VT);
16841 }
16842
16843 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16844        && "Unknown target vector shift-by-constant node");
16845
16846 // Fold this packed vector shift into a build vector if SrcOp is a
16847 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16848 if (VT == SrcOp.getSimpleValueType() &&
16849 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16850 SmallVector<SDValue, 8> Elts;
16851 unsigned NumElts = SrcOp->getNumOperands();
16852 ConstantSDNode *ND;
16853
16854 switch(Opc) {
16855 default: llvm_unreachable(nullptr);
16856 case X86ISD::VSHLI:
16857 for (unsigned i=0; i!=NumElts; ++i) {
16858 SDValue CurrentOp = SrcOp->getOperand(i);
16859 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16860 Elts.push_back(CurrentOp);
16861 continue;
16862 }
16863 ND = cast<ConstantSDNode>(CurrentOp);
16864 const APInt &C = ND->getAPIntValue();
16865 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16866 }
16867 break;
16868 case X86ISD::VSRLI:
16869 for (unsigned i=0; i!=NumElts; ++i) {
16870 SDValue CurrentOp = SrcOp->getOperand(i);
16871 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16872 Elts.push_back(CurrentOp);
16873 continue;
16874 }
16875 ND = cast<ConstantSDNode>(CurrentOp);
16876 const APInt &C = ND->getAPIntValue();
16877 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16878 }
16879 break;
16880 case X86ISD::VSRAI:
16881 for (unsigned i=0; i!=NumElts; ++i) {
16882 SDValue CurrentOp = SrcOp->getOperand(i);
16883 if (CurrentOp->getOpcode() == ISD::UNDEF) {
16884 Elts.push_back(CurrentOp);
16885 continue;
16886 }
16887 ND = cast<ConstantSDNode>(CurrentOp);
16888 const APInt &C = ND->getAPIntValue();
16889 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16890 }
16891 break;
16892 }
16893
16894 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16895 }
16896
16897 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16898}
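
Aside: a worked example of the constant folding above (illustrative values). Note also the clamp at the top of the function: an arithmetic shift by at least the element width is clamped to width - 1, while an equally large logical shift folds straight to the zero vector.

    //   Opc = X86ISD::VSHLI, VT = v4i32, ShiftAmt = 2
    //   SrcOp  = BUILD_VECTOR <1, 2, undef, 8>
    //   result = BUILD_VECTOR <4, 8, undef, 32>   (no shift instruction is emitted)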
16899
16900// getTargetVShiftNode - Handle vector element shifts where the shift amount
16901// may or may not be a constant. Takes immediate version of shift as input.
16902static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16903 SDValue SrcOp, SDValue ShAmt,
16904 SelectionDAG &DAG) {
16905 MVT SVT = ShAmt.getSimpleValueType();
16906 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16907
16908 // Catch shift-by-constant.
16909 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16910 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16911 CShAmt->getZExtValue(), DAG);
16912
16913 // Change opcode to non-immediate version
16914 switch (Opc) {
16915 default: llvm_unreachable("Unknown target vector shift node");
16916 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16917 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16918 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16919 }
16920
16921 const X86Subtarget &Subtarget =
16922 DAG.getTarget().getSubtarget<X86Subtarget>();
16923 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16924 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16925 // Let the shuffle legalizer expand this shift amount node.
16926 SDValue Op0 = ShAmt.getOperand(0);
16927 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16928 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16929 } else {
16930 // Need to build a vector containing shift amount.
16931 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
16932 SmallVector<SDValue, 4> ShOps;
16933 ShOps.push_back(ShAmt);
16934 if (SVT == MVT::i32) {
16935 ShOps.push_back(DAG.getConstant(0, SVT));
16936 ShOps.push_back(DAG.getUNDEF(SVT));
16937 }
16938 ShOps.push_back(DAG.getUNDEF(SVT));
16939
16940 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16941 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16942 }
16943
16944 // The return type has to be a 128-bit type with the same element
16945 // type as the input type.
16946 MVT EltVT = VT.getVectorElementType();
16947 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16948
16949 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16950 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16951}
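
Aside: for a non-constant shift amount the else branch above widens the scalar count into a 128-bit vector because the SSE/AVX packed shifts read only the low 64 bits of their count operand; an illustrative trace for an i32 amount:

    //   ShAmt : i32 (runtime value)
    //   -> BUILD_VECTOR <ShAmt, 0, undef, undef> : v4i32
    //   -> BITCAST to the 128-bit vector with VT's element type (e.g. v8i16)
    //   -> X86ISD::VSHL / VSRL / VSRA (SrcOp, widened count)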
16952
16953/// \brief Return (and \p Op, \p Mask) for compare instructions or
16954/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16955/// necessary casting for \p Mask when lowering masking intrinsics.
16956static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16957 SDValue PreservedSrc,
16958 const X86Subtarget *Subtarget,
16959 SelectionDAG &DAG) {
16960 EVT VT = Op.getValueType();
16961 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16962 MVT::i1, VT.getVectorNumElements());
16963 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16964 Mask.getValueType().getSizeInBits());
16965 SDLoc dl(Op);
16966
16967 assert(MaskVT.isSimple() && "invalid mask type");
16968
16969 if (isAllOnes(Mask))
16970 return Op;
16971
16972 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
16973 // are extracted by EXTRACT_SUBVECTOR.
16974 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16975 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16976 DAG.getIntPtrConstant(0));
16977
16978 switch (Op.getOpcode()) {
16979 default: break;
16980 case X86ISD::PCMPEQM:
16981 case X86ISD::PCMPGTM:
16982 case X86ISD::CMPM:
16983 case X86ISD::CMPMU:
16984 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16985 }
16986 if (PreservedSrc.getOpcode() == ISD::UNDEF)
16987 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16988 return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
16989}
16990
16991/// \brief Creates an SDNode for a predicated scalar operation.
16992/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
16993 /// The mask comes in as MVT::i8 and should be truncated
16994 /// to MVT::i1 while lowering masking intrinsics.
16995/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
16996/// "X86select" instead of "vselect". We just can't create the "vselect" node for
16997/// a scalar instruction.
16998static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
16999 SDValue PreservedSrc,
17000 const X86Subtarget *Subtarget,
17001 SelectionDAG &DAG) {
17002 if (isAllOnes(Mask))
17003 return Op;
17004
17005 EVT VT = Op.getValueType();
17006 SDLoc dl(Op);
17007 // The mask should be of type MVT::i1
17008 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17009
17010 if (PreservedSrc.getOpcode() == ISD::UNDEF)
17011 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17012 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17013}
17014
17015static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17016 SelectionDAG &DAG) {
17017 SDLoc dl(Op);
17018 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17019 EVT VT = Op.getValueType();
17020 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17021 if (IntrData) {
17022 switch(IntrData->Type) {
17023 case INTR_TYPE_1OP:
17024 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17025 case INTR_TYPE_2OP:
17026 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17027 Op.getOperand(2));
17028 case INTR_TYPE_3OP:
17029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17030 Op.getOperand(2), Op.getOperand(3));
17031 case INTR_TYPE_1OP_MASK_RM: {
17032 SDValue Src = Op.getOperand(1);
17033 SDValue Src0 = Op.getOperand(2);
17034 SDValue Mask = Op.getOperand(3);
17035 SDValue RoundingMode = Op.getOperand(4);
17036 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17037 RoundingMode),
17038 Mask, Src0, Subtarget, DAG);
17039 }
17040 case INTR_TYPE_SCALAR_MASK_RM: {
17041 SDValue Src1 = Op.getOperand(1);
17042 SDValue Src2 = Op.getOperand(2);
17043 SDValue Src0 = Op.getOperand(3);
17044 SDValue Mask = Op.getOperand(4);
17045 SDValue RoundingMode = Op.getOperand(5);
17046 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17047 RoundingMode),
17048 Mask, Src0, Subtarget, DAG);
17049 }
17050 case INTR_TYPE_2OP_MASK: {
17051 SDValue Mask = Op.getOperand(4);
17052 SDValue PassThru = Op.getOperand(3);
17053 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17054 if (IntrWithRoundingModeOpcode != 0) {
17055 unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17056 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17057 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17058 dl, Op.getValueType(),
17059 Op.getOperand(1), Op.getOperand(2),
17060 Op.getOperand(3), Op.getOperand(5)),
17061 Mask, PassThru, Subtarget, DAG);
17062 }
17063 }
17064 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17065 Op.getOperand(1),
17066 Op.getOperand(2)),
17067 Mask, PassThru, Subtarget, DAG);
17068 }
17069 case FMA_OP_MASK: {
17070 SDValue Src1 = Op.getOperand(1);
17071 SDValue Src2 = Op.getOperand(2);
17072 SDValue Src3 = Op.getOperand(3);
17073 SDValue Mask = Op.getOperand(4);
17074 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17075 if (IntrWithRoundingModeOpcode != 0) {
17076 SDValue Rnd = Op.getOperand(5);
17077 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17078 X86::STATIC_ROUNDING::CUR_DIRECTION)
17079 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17080 dl, Op.getValueType(),
17081 Src1, Src2, Src3, Rnd),
17082 Mask, Src1, Subtarget, DAG);
17083 }
17084 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17085 dl, Op.getValueType(),
17086 Src1, Src2, Src3),
17087 Mask, Src1, Subtarget, DAG);
17088 }
17089 case CMP_MASK:
17090 case CMP_MASK_CC: {
17091 // Comparison intrinsics with masks.
17092 // Example of transformation:
17093 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17094 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17095 // (i8 (bitcast
17096 // (v8i1 (insert_subvector undef,
17097 // (v2i1 (and (PCMPEQM %a, %b),
17098 // (extract_subvector
17099 // (v8i1 (bitcast %mask)), 0))), 0))))
17100 EVT VT = Op.getOperand(1).getValueType();
17101 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17102 VT.getVectorNumElements());
17103 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17104 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17105 Mask.getValueType().getSizeInBits());
17106 SDValue Cmp;
17107 if (IntrData->Type == CMP_MASK_CC) {
17108 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17109 Op.getOperand(2), Op.getOperand(3));
17110 } else {
17111 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17112 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17113 Op.getOperand(2));
17114 }
17115 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17116 DAG.getTargetConstant(0, MaskVT),
17117 Subtarget, DAG);
17118 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17119 DAG.getUNDEF(BitcastVT), CmpMask,
17120 DAG.getIntPtrConstant(0));
17121 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17122 }
17123 case COMI: { // Comparison intrinsics
17124 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17125 SDValue LHS = Op.getOperand(1);
17126 SDValue RHS = Op.getOperand(2);
17127 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17128 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17129 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17130 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17131 DAG.getConstant(X86CC, MVT::i8), Cond);
17132 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17133 }
17134 case VSHIFT:
17135 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17136 Op.getOperand(1), Op.getOperand(2), DAG);
17137 case VSHIFT_MASK:
17138 return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17139 Op.getSimpleValueType(),
17140 Op.getOperand(1),
17141 Op.getOperand(2), DAG),
17142 Op.getOperand(4), Op.getOperand(3), Subtarget,
17143 DAG);
17144 case COMPRESS_EXPAND_IN_REG: {
17145 SDValue Mask = Op.getOperand(3);
17146 SDValue DataToCompress = Op.getOperand(1);
17147 SDValue PassThru = Op.getOperand(2);
17148 if (isAllOnes(Mask)) // return data as is
17149 return Op.getOperand(1);
17150 EVT VT = Op.getValueType();
17151 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17152 VT.getVectorNumElements());
17153 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17154 Mask.getValueType().getSizeInBits());
17155 SDLoc dl(Op);
17156 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17157 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17158 DAG.getIntPtrConstant(0));
17159
17160 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17161 PassThru);
17162 }
17163 case BLEND: {
17164 SDValue Mask = Op.getOperand(3);
17165 EVT VT = Op.getValueType();
17166 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17167 VT.getVectorNumElements());
17168 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17169 Mask.getValueType().getSizeInBits());
17170 SDLoc dl(Op);
17171 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17172 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17173 DAG.getIntPtrConstant(0));
17174 return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17175 Op.getOperand(2));
17176 }
17177 default:
17178 break;
17179 }
17180 }
17181
17182 switch (IntNo) {
17183 default: return SDValue(); // Don't custom lower most intrinsics.
17184
17185 case Intrinsic::x86_avx512_mask_valign_q_512:
17186 case Intrinsic::x86_avx512_mask_valign_d_512:
17187 // Vector source operands are swapped.
17188 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17189 Op.getValueType(), Op.getOperand(2),
17190 Op.getOperand(1),
17191 Op.getOperand(3)),
17192 Op.getOperand(5), Op.getOperand(4),
17193 Subtarget, DAG);
17194
17195 // ptest and testp intrinsics. The intrinsic these come from are designed to
17196 // return an integer value, not just an instruction so lower it to the ptest
17197 // or testp pattern and a setcc for the result.
17198 case Intrinsic::x86_sse41_ptestz:
17199 case Intrinsic::x86_sse41_ptestc:
17200 case Intrinsic::x86_sse41_ptestnzc:
17201 case Intrinsic::x86_avx_ptestz_256:
17202 case Intrinsic::x86_avx_ptestc_256:
17203 case Intrinsic::x86_avx_ptestnzc_256:
17204 case Intrinsic::x86_avx_vtestz_ps:
17205 case Intrinsic::x86_avx_vtestc_ps:
17206 case Intrinsic::x86_avx_vtestnzc_ps:
17207 case Intrinsic::x86_avx_vtestz_pd:
17208 case Intrinsic::x86_avx_vtestc_pd:
17209 case Intrinsic::x86_avx_vtestnzc_pd:
17210 case Intrinsic::x86_avx_vtestz_ps_256:
17211 case Intrinsic::x86_avx_vtestc_ps_256:
17212 case Intrinsic::x86_avx_vtestnzc_ps_256:
17213 case Intrinsic::x86_avx_vtestz_pd_256:
17214 case Intrinsic::x86_avx_vtestc_pd_256:
17215 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17216 bool IsTestPacked = false;
17217 unsigned X86CC;
17218 switch (IntNo) {
17219 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17220 case Intrinsic::x86_avx_vtestz_ps:
17221 case Intrinsic::x86_avx_vtestz_pd:
17222 case Intrinsic::x86_avx_vtestz_ps_256:
17223 case Intrinsic::x86_avx_vtestz_pd_256:
17224 IsTestPacked = true; // Fallthrough
17225 case Intrinsic::x86_sse41_ptestz:
17226 case Intrinsic::x86_avx_ptestz_256:
17227 // ZF = 1
17228 X86CC = X86::COND_E;
17229 break;
17230 case Intrinsic::x86_avx_vtestc_ps:
17231 case Intrinsic::x86_avx_vtestc_pd:
17232 case Intrinsic::x86_avx_vtestc_ps_256:
17233 case Intrinsic::x86_avx_vtestc_pd_256:
17234 IsTestPacked = true; // Fallthrough
17235 case Intrinsic::x86_sse41_ptestc:
17236 case Intrinsic::x86_avx_ptestc_256:
17237 // CF = 1
17238 X86CC = X86::COND_B;
17239 break;
17240 case Intrinsic::x86_avx_vtestnzc_ps:
17241 case Intrinsic::x86_avx_vtestnzc_pd:
17242 case Intrinsic::x86_avx_vtestnzc_ps_256:
17243 case Intrinsic::x86_avx_vtestnzc_pd_256:
17244 IsTestPacked = true; // Fallthrough
17245 case Intrinsic::x86_sse41_ptestnzc:
17246 case Intrinsic::x86_avx_ptestnzc_256:
17247 // ZF and CF = 0
17248 X86CC = X86::COND_A;
17249 break;
17250 }
17251
17252 SDValue LHS = Op.getOperand(1);
17253 SDValue RHS = Op.getOperand(2);
17254 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17255 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17256 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17257 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17258 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17259 }
17260 case Intrinsic::x86_avx512_kortestz_w:
17261 case Intrinsic::x86_avx512_kortestc_w: {
17262 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17263 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17264 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17265 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17266 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17267 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17268 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17269 }
17270
17271 case Intrinsic::x86_sse42_pcmpistria128:
17272 case Intrinsic::x86_sse42_pcmpestria128:
17273 case Intrinsic::x86_sse42_pcmpistric128:
17274 case Intrinsic::x86_sse42_pcmpestric128:
17275 case Intrinsic::x86_sse42_pcmpistrio128:
17276 case Intrinsic::x86_sse42_pcmpestrio128:
17277 case Intrinsic::x86_sse42_pcmpistris128:
17278 case Intrinsic::x86_sse42_pcmpestris128:
17279 case Intrinsic::x86_sse42_pcmpistriz128:
17280 case Intrinsic::x86_sse42_pcmpestriz128: {
17281 unsigned Opcode;
17282 unsigned X86CC;
17283 switch (IntNo) {
17284 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
17285 case Intrinsic::x86_sse42_pcmpistria128:
17286 Opcode = X86ISD::PCMPISTRI;
17287 X86CC = X86::COND_A;
17288 break;
17289 case Intrinsic::x86_sse42_pcmpestria128:
17290 Opcode = X86ISD::PCMPESTRI;
17291 X86CC = X86::COND_A;
17292 break;
17293 case Intrinsic::x86_sse42_pcmpistric128:
17294 Opcode = X86ISD::PCMPISTRI;
17295 X86CC = X86::COND_B;
17296 break;
17297 case Intrinsic::x86_sse42_pcmpestric128:
17298 Opcode = X86ISD::PCMPESTRI;
17299 X86CC = X86::COND_B;
17300 break;
17301 case Intrinsic::x86_sse42_pcmpistrio128:
17302 Opcode = X86ISD::PCMPISTRI;
17303 X86CC = X86::COND_O;
17304 break;
17305 case Intrinsic::x86_sse42_pcmpestrio128:
17306 Opcode = X86ISD::PCMPESTRI;
17307 X86CC = X86::COND_O;
17308 break;
17309 case Intrinsic::x86_sse42_pcmpistris128:
17310 Opcode = X86ISD::PCMPISTRI;
17311 X86CC = X86::COND_S;
17312 break;
17313 case Intrinsic::x86_sse42_pcmpestris128:
17314 Opcode = X86ISD::PCMPESTRI;
17315 X86CC = X86::COND_S;
17316 break;
17317 case Intrinsic::x86_sse42_pcmpistriz128:
17318 Opcode = X86ISD::PCMPISTRI;
17319 X86CC = X86::COND_E;
17320 break;
17321 case Intrinsic::x86_sse42_pcmpestriz128:
17322 Opcode = X86ISD::PCMPESTRI;
17323 X86CC = X86::COND_E;
17324 break;
17325 }
17326 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17327 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17328 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17329 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17330 DAG.getConstant(X86CC, MVT::i8),
17331 SDValue(PCMP.getNode(), 1));
17332 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17333 }
17334
17335 case Intrinsic::x86_sse42_pcmpistri128:
17336 case Intrinsic::x86_sse42_pcmpestri128: {
17337 unsigned Opcode;
17338 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17339 Opcode = X86ISD::PCMPISTRI;
17340 else
17341 Opcode = X86ISD::PCMPESTRI;
17342
17343 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17344 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17345 return DAG.getNode(Opcode, dl, VTs, NewOps);
17346 }
17347 }
17348}
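
Aside: the ptest/testp and pcmpistri/pcmpestri cases above share one shape: emit the flag-producing node, materialize the requested flag with X86ISD::SETCC, and zero-extend the result to i32. An illustrative trace for one intrinsic (the final instruction names are what selection is expected to produce, not something this function emits directly):

    //   int_x86_sse41_ptestz(a, b)
    //   -> Test  = X86ISD::PTEST(a, b)           ; sets ZF/CF
    //   -> SetCC = X86ISD::SETCC(COND_E, Test)   ; 1 iff ZF == 1
    //   -> ZERO_EXTEND to i32                    ; typically PTEST + SETE + MOVZX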
17349
17350static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17351 SDValue Src, SDValue Mask, SDValue Base,
17352 SDValue Index, SDValue ScaleOp, SDValue Chain,
17353 const X86Subtarget * Subtarget) {
17354 SDLoc dl(Op);
17355 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17356 assert(C && "Invalid scale type");
17357 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17358 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17359 Index.getSimpleValueType().getVectorNumElements());
17360 SDValue MaskInReg;
17361 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17362 if (MaskC)
17363 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17364 else
17365 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17366 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17367 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17368 SDValue Segment = DAG.getRegister(0, MVT::i32);
17369 if (Src.getOpcode() == ISD::UNDEF)
17370 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17371 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17372 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17373 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17374 return DAG.getMergeValues(RetOps, dl);
17375}
17376
17377static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17378 SDValue Src, SDValue Mask, SDValue Base,
17379 SDValue Index, SDValue ScaleOp, SDValue Chain) {
17380 SDLoc dl(Op);
17381 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17382 assert(C && "Invalid scale type");
17383 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17384 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17385 SDValue Segment = DAG.getRegister(0, MVT::i32);
17386 EVT MaskVT = MVT::getVectorVT(MVT::i1,
17387 Index.getSimpleValueType().getVectorNumElements());
17388 SDValue MaskInReg;
17389 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17390 if (MaskC)
17391 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17392 else
17393 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17394 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17395 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17396 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17397 return SDValue(Res, 1);
17398}
17399
17400static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17401 SDValue Mask, SDValue Base, SDValue Index,
17402 SDValue ScaleOp, SDValue Chain) {
17403 SDLoc dl(Op);
17404 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17405 assert(C && "Invalid scale type");
17406 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17407 SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17408 SDValue Segment = DAG.getRegister(0, MVT::i32);
17409 EVT MaskVT =
17410 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17411 SDValue MaskInReg;
17412 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17413 if (MaskC)
17414 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17415 else
17416 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17417 //SDVTList VTs = DAG.getVTList(MVT::Other);
17418 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17419 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17420 return SDValue(Res, 0);
17421}
17422
17423// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17424// read performance monitor counters (x86_rdpmc).
17425static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17426 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17427 SmallVectorImpl<SDValue> &Results) {
17428 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17429 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17430 SDValue LO, HI;
17431
17432 // The ECX register is used to select the index of the performance counter
17433 // to read.
17434 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17435 N->getOperand(2));
17436 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17437
17438 // Reads the content of a 64-bit performance counter and returns it in the
17439 // registers EDX:EAX.
17440 if (Subtarget->is64Bit()) {
17441 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17442 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17443 LO.getValue(2));
17444 } else {
17445 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17446 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17447 LO.getValue(2));
17448 }
17449 Chain = HI.getValue(1);
17450
17451 if (Subtarget->is64Bit()) {
17452 // The EAX register is loaded with the low-order 32 bits. The EDX register
17453 // is loaded with the supported high-order bits of the counter.
17454 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17455 DAG.getConstant(32, MVT::i8));
17456 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17457 Results.push_back(Chain);
17458 return;
17459 }
17460
17461 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17462 SDValue Ops[] = { LO, HI };
17463 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17464 Results.push_back(Pair);
17465 Results.push_back(Chain);
17466}
17467
17468// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17469// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17470// also used to custom lower READCYCLECOUNTER nodes.
17471static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17472 SelectionDAG &DAG, const X86Subtarget *Subtarget,
17473 SmallVectorImpl<SDValue> &Results) {
17474 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17475 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17476 SDValue LO, HI;
17477
17478 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17479 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17480 // and the EAX register is loaded with the low-order 32 bits.
17481 if (Subtarget->is64Bit()) {
17482 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17483 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17484 LO.getValue(2));
17485 } else {
17486 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17487 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17488 LO.getValue(2));
17489 }
17490 SDValue Chain = HI.getValue(1);
17491
17492 if (Opcode == X86ISD::RDTSCP_DAG) {
17493 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17494
17495 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17496 // the ECX register. Add 'ecx' explicitly to the chain.
17497 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17498 HI.getValue(2));
17499 // Explicitly store the content of ECX at the location passed in input
17500 // to the 'rdtscp' intrinsic.
17501 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17502 MachinePointerInfo(), false, false, 0);
17503 }
17504
17505 if (Subtarget->is64Bit()) {
17506 // The EDX register is loaded with the high-order 32 bits of the MSR, and
17507 // the EAX register is loaded with the low-order 32 bits.
17508 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17509 DAG.getConstant(32, MVT::i8));
17510 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17511 Results.push_back(Chain);
17512 return;
17513 }
17514
17515 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17516 SDValue Ops[] = { LO, HI };
17517 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17518 Results.push_back(Pair);
17519 Results.push_back(Chain);
17520}
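
Aside: on 64-bit targets the two counter halves read above are recombined as (HI << 32) | LO; a minimal sketch in plain C++ (the real code builds ISD::SHL and ISD::OR nodes).

    #include <cstdint>
    inline uint64_t mergeCounterHalves(uint32_t HiEdx, uint32_t LoEax) {
      // e.g. HiEdx = 0x00000001, LoEax = 0x23456789 -> 0x0000000123456789
      return (static_cast<uint64_t>(HiEdx) << 32) | LoEax;
    }
    // On 32-bit targets the pair is kept as an ISD::BUILD_PAIR of two i32 values instead.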
17521
17522static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17523 SelectionDAG &DAG) {
17524 SmallVector<SDValue, 2> Results;
17525 SDLoc DL(Op);
17526 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17527 Results);
17528 return DAG.getMergeValues(Results, DL);
17529}
17530
17531
17532static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17533 SelectionDAG &DAG) {
17534 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17535
17536 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17537 if (!IntrData)
17538 return SDValue();
17539
17540 SDLoc dl(Op);
17541 switch(IntrData->Type) {
17542 default:
17543 llvm_unreachable("Unknown Intrinsic Type");
17544 break;
17545 case RDSEED:
17546 case RDRAND: {
17547 // Emit the node with the right value type.
17548 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17549 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17550
17551 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17552 // Otherwise return the value from Rand, which is always 0, cast to i32.
17553 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17554 DAG.getConstant(1, Op->getValueType(1)),
17555 DAG.getConstant(X86::COND_B, MVT::i32),
17556 SDValue(Result.getNode(), 1) };
17557 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17558 DAG.getVTList(Op->getValueType(1), MVT::Glue),
17559 Ops);
17560
17561 // Return { result, isValid, chain }.
17562 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17563 SDValue(Result.getNode(), 2));
17564 }
17565 case GATHER: {
17566 //gather(v1, mask, index, base, scale);
17567 SDValue Chain = Op.getOperand(0);
17568 SDValue Src = Op.getOperand(2);
17569 SDValue Base = Op.getOperand(3);
17570 SDValue Index = Op.getOperand(4);
17571 SDValue Mask = Op.getOperand(5);
17572 SDValue Scale = Op.getOperand(6);
17573 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17574 Subtarget);
17575 }
17576 case SCATTER: {
17577 //scatter(base, mask, index, v1, scale);
17578 SDValue Chain = Op.getOperand(0);
17579 SDValue Base = Op.getOperand(2);
17580 SDValue Mask = Op.getOperand(3);
17581 SDValue Index = Op.getOperand(4);
17582 SDValue Src = Op.getOperand(5);
17583 SDValue Scale = Op.getOperand(6);
17584 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17585 }
17586 case PREFETCH: {
17587 SDValue Hint = Op.getOperand(6);
17588 unsigned HintVal;
17589 if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17590 (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17591 llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17592 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17593 SDValue Chain = Op.getOperand(0);
17594 SDValue Mask = Op.getOperand(2);
17595 SDValue Index = Op.getOperand(3);
17596 SDValue Base = Op.getOperand(4);
17597 SDValue Scale = Op.getOperand(5);
17598 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17599 }
17600 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17601 case RDTSC: {
17602 SmallVector<SDValue, 2> Results;
17603 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17604 return DAG.getMergeValues(Results, dl);
17605 }
17606 // Read Performance Monitoring Counters.
17607 case RDPMC: {
17608 SmallVector<SDValue, 2> Results;
17609 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17610 return DAG.getMergeValues(Results, dl);
17611 }
17612 // XTEST intrinsics.
17613 case XTEST: {
17614 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17615 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17616 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17617 DAG.getConstant(X86::COND_NE, MVT::i8),
17618 InTrans);
17619 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17620 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17621 Ret, SDValue(InTrans.getNode(), 1));
17622 }
17623 // ADC/ADCX/SBB
17624 case ADX: {
17625 SmallVector<SDValue, 2> Results;
17626 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17627 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17628 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17629 DAG.getConstant(-1, MVT::i8));
17630 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17631 Op.getOperand(4), GenCF.getValue(1));
17632 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17633 Op.getOperand(5), MachinePointerInfo(),
17634 false, false, 0);
17635 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17636 DAG.getConstant(X86::COND_B, MVT::i8),
17637 Res.getValue(1));
17638 Results.push_back(SetCC);
17639 Results.push_back(Store);
17640 return DAG.getMergeValues(Results, dl);
17641 }
17642 case COMPRESS_TO_MEM: {
17643 SDLoc dl(Op);
17644 SDValue Mask = Op.getOperand(4);
17645 SDValue DataToCompress = Op.getOperand(3);
17646 SDValue Addr = Op.getOperand(2);
17647 SDValue Chain = Op.getOperand(0);
17648
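// With an all-ones mask this degenerates to an ordinary store.  Otherwise the
// scalar mask register is reinterpreted as a vector of i1 (bitcast to an i1
// vector as wide as the mask, then the low VT-sized subvector is extracted)
// and the data is compressed against an UNDEF pass-through before the store.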
17649 if (isAllOnes(Mask)) // return just a store
17650 return DAG.getStore(Chain, dl, DataToCompress, Addr,
17651 MachinePointerInfo(), false, false, 0);
17652
17653 EVT VT = DataToCompress.getValueType();
17654 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17655 VT.getVectorNumElements());
17656 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17657 Mask.getValueType().getSizeInBits());
17658 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17659 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17660 DAG.getIntPtrConstant(0));
17661
17662 SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17663 DataToCompress, DAG.getUNDEF(VT));
17664 return DAG.getStore(Chain, dl, Compressed, Addr,
17665 MachinePointerInfo(), false, false, 0);
17666 }
17667 case EXPAND_FROM_MEM: {
17668 SDLoc dl(Op);
17669 SDValue Mask = Op.getOperand(4);
17670 SDValue PathThru = Op.getOperand(3);
17671 SDValue Addr = Op.getOperand(2);
17672 SDValue Chain = Op.getOperand(0);
17673 EVT VT = Op.getValueType();
17674
17675 if (isAllOnes(Mask)) // return just a load
17676 return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17677 false, 0);
17678 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17679 VT.getVectorNumElements());
17680 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17681 Mask.getValueType().getSizeInBits());
17682 SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17683 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17684 DAG.getIntPtrConstant(0));
17685
17686 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17687 false, false, false, 0);
17688
17689 SmallVector<SDValue, 2> Results;
17690 Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17691 PathThru));
17692 Results.push_back(Chain);
17693 return DAG.getMergeValues(Results, dl);
17694 }
17695 }
17696}
17697
17698SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17699 SelectionDAG &DAG) const {
17700 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17701 MFI->setReturnAddressIsTaken(true);
17702
17703 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17704 return SDValue();
17705
17706 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17707 SDLoc dl(Op);
17708 EVT PtrVT = getPointerTy();
17709
17710 if (Depth > 0) {
17711 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17712 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17713 DAG.getSubtarget().getRegisterInfo());
17714 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17715 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17716 DAG.getNode(ISD::ADD, dl, PtrVT,
17717 FrameAddr, Offset),
17718 MachinePointerInfo(), false, false, false, 0);
17719 }
17720
17721 // Just load the return address.
17722 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17723 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17724 RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17725}
17726
17727SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17728 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17729 MFI->setFrameAddressIsTaken(true);
17730
17731 EVT VT = Op.getValueType();
17732 SDLoc dl(Op); // FIXME probably not meaningful
17733 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17734 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17735 DAG.getSubtarget().getRegisterInfo());
17736 unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17737 DAG.getMachineFunction());
17738 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17739 (FrameReg == X86::EBP && VT == MVT::i32)) &&
17740 "Invalid Frame Register!");
17741 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17742 while (Depth--)
17743 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17744 MachinePointerInfo(),
17745 false, false, false, 0);
17746 return FrameAddr;
17747}
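// A minimal scalar model of the Depth loop above (illustrative only, assuming
// the saved frame pointer of each frame sits at offset 0, which is what the
// emitted loads rely on): walking N frames up is N dependent loads off RBP/EBP.
static void *frameAddressModel(void *FramePtr, unsigned Depth) {
  while (Depth--)
    FramePtr = *static_cast<void **>(FramePtr); // load the caller's saved frame pointer
  return FramePtr;
}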
17748
17749// FIXME? Maybe this could be a TableGen attribute on some registers and
17750// this table could be generated automatically from RegInfo.
17751unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17752 EVT VT) const {
17753 unsigned Reg = StringSwitch<unsigned>(RegName)
17754 .Case("esp", X86::ESP)
17755 .Case("rsp", X86::RSP)
17756 .Default(0);
17757 if (Reg)
17758 return Reg;
17759 report_fatal_error("Invalid register name global variable");
17760}
17761
17762SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17763 SelectionDAG &DAG) const {
17764 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17765 DAG.getSubtarget().getRegisterInfo());
17766 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17767}
17768
17769SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17770 SDValue Chain = Op.getOperand(0);
17771 SDValue Offset = Op.getOperand(1);
17772 SDValue Handler = Op.getOperand(2);
17773 SDLoc dl (Op);
17774
17775 EVT PtrVT = getPointerTy();
17776 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17777 DAG.getSubtarget().getRegisterInfo());
17778 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17779 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17780 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17781 "Invalid Frame Register!");
17782 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17783 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
17784
17785 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17786 DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17787 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17788 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17789 false, false, 0);
17790 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17791
17792 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17793 DAG.getRegister(StoreAddrReg, PtrVT));
17794}
17795
17796SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17797 SelectionDAG &DAG) const {
17798 SDLoc DL(Op);
17799 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17800 DAG.getVTList(MVT::i32, MVT::Other),
17801 Op.getOperand(0), Op.getOperand(1));
17802}
17803
17804SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17805 SelectionDAG &DAG) const {
17806 SDLoc DL(Op);
17807 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17808 Op.getOperand(0), Op.getOperand(1));
17809}
17810
17811static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17812 return Op.getOperand(0);
17813}
17814
17815SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
17816 SelectionDAG &DAG) const {
17817 SDValue Root = Op.getOperand(0);
17818 SDValue Trmp = Op.getOperand(1); // trampoline
17819 SDValue FPtr = Op.getOperand(2); // nested function
17820 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
17821 SDLoc dl (Op);
17822
17823 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17824 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
17825
17826 if (Subtarget->is64Bit()) {
17827 SDValue OutChains[6];
17828
17829 // Large code-model.
17830 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
17831 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
17832
17833 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
17834 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
17835
17836 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
17837
17838 // Load the pointer to the nested function into R11.
17839 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
17840 SDValue Addr = Trmp;
17841 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17842 Addr, MachinePointerInfo(TrmpAddr),
17843 false, false, 0);
17844
17845 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17846 DAG.getConstant(2, MVT::i64));
17847 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
17848 MachinePointerInfo(TrmpAddr, 2),
17849 false, false, 2);
17850
17851 // Load the 'nest' parameter value into R10.
17852 // R10 is specified in X86CallingConv.td
17853 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
17854 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17855 DAG.getConstant(10, MVT::i64));
17856 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17857 Addr, MachinePointerInfo(TrmpAddr, 10),
17858 false, false, 0);
17859
17860 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17861 DAG.getConstant(12, MVT::i64));
17862 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
17863 MachinePointerInfo(TrmpAddr, 12),
17864 false, false, 2);
17865
17866 // Jump to the nested function.
17867 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
17868 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17869 DAG.getConstant(20, MVT::i64));
17870 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17871 Addr, MachinePointerInfo(TrmpAddr, 20),
17872 false, false, 0);
17873
17874 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
17875 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17876 DAG.getConstant(22, MVT::i64));
17877 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
17878 MachinePointerInfo(TrmpAddr, 22),
17879 false, false, 0);
17880
17881 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17882 } else {
17883 const Function *Func =
17884 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
17885 CallingConv::ID CC = Func->getCallingConv();
17886 unsigned NestReg;
17887
17888 switch (CC) {
17889 default:
17890 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 17890)
;
17891 case CallingConv::C:
17892 case CallingConv::X86_StdCall: {
17893 // Pass 'nest' parameter in ECX.
17894 // Must be kept in sync with X86CallingConv.td
17895 NestReg = X86::ECX;
17896
17897 // Check that ECX wasn't needed by an 'inreg' parameter.
17898 FunctionType *FTy = Func->getFunctionType();
17899 const AttributeSet &Attrs = Func->getAttributes();
17900
17901 if (!Attrs.isEmpty() && !Func->isVarArg()) {
17902 unsigned InRegCount = 0;
17903 unsigned Idx = 1;
17904
17905 for (FunctionType::param_iterator I = FTy->param_begin(),
17906 E = FTy->param_end(); I != E; ++I, ++Idx)
17907 if (Attrs.hasAttribute(Idx, Attribute::InReg))
17908 // FIXME: should only count parameters that are lowered to integers.
17909 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
17910
17911 if (InRegCount > 2) {
17912 report_fatal_error("Nest register in use - reduce number of inreg"
17913 " parameters!");
17914 }
17915 }
17916 break;
17917 }
17918 case CallingConv::X86_FastCall:
17919 case CallingConv::X86_ThisCall:
17920 case CallingConv::Fast:
17921 // Pass 'nest' parameter in EAX.
17922 // Must be kept in sync with X86CallingConv.td
17923 NestReg = X86::EAX;
17924 break;
17925 }
17926
17927 SDValue OutChains[4];
17928 SDValue Addr, Disp;
17929
17930 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17931 DAG.getConstant(10, MVT::i32));
17932 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
17933
17934 // This is storing the opcode for MOV32ri.
17935 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
17936 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
17937 OutChains[0] = DAG.getStore(Root, dl,
17938 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
17939 Trmp, MachinePointerInfo(TrmpAddr),
17940 false, false, 0);
17941
17942 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17943 DAG.getConstant(1, MVT::i32));
17944 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
17945 MachinePointerInfo(TrmpAddr, 1),
17946 false, false, 1);
17947
17948 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
17949 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17950 DAG.getConstant(5, MVT::i32));
17951 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
17952 MachinePointerInfo(TrmpAddr, 5),
17953 false, false, 1);
17954
17955 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17956 DAG.getConstant(6, MVT::i32));
17957 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
17958 MachinePointerInfo(TrmpAddr, 6),
17959 false, false, 1);
17960
17961 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17962 }
17963}
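// For reference, the trampoline bytes emitted above are, in the 64-bit case
// (23 bytes):
//   +0   49 BB <FPtr:imm64>   movabs $FPtr, %r11
//   +10  49 BA <Nest:imm64>   movabs $Nest, %r10
//   +20  49 FF E3             jmp    *%r11
// and in the 32-bit case (10 bytes): B8+reg <Nest:imm32> (mov into EAX/ECX as
// the calling convention requires), then E9 <rel32> with the displacement
// taken relative to the end of the jmp at offset 10.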
17964
17965SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
17966 SelectionDAG &DAG) const {
17967 /*
17968 The rounding mode is in bits 11:10 of the x87 control word (FPCW), and has the following
17969 settings:
17970 00 Round to nearest
17971 01 Round to -inf
17972 10 Round to +inf
17973 11 Round to 0
17974
17975 FLT_ROUNDS, on the other hand, expects the following:
17976 -1 Undefined
17977 0 Round to 0
17978 1 Round to nearest
17979 2 Round to +inf
17980 3 Round to -inf
17981
17982 To perform the conversion, we do:
17983 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
17984 */
17985
17986 MachineFunction &MF = DAG.getMachineFunction();
17987 const TargetMachine &TM = MF.getTarget();
17988 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
17989 unsigned StackAlignment = TFI.getStackAlignment();
17990 MVT VT = Op.getSimpleValueType();
17991 SDLoc DL(Op);
17992
17993 // Save FP Control Word to stack slot
17994 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
17995 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
17996
17997 MachineMemOperand *MMO =
17998 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
17999 MachineMemOperand::MOStore, 2, 2);
18000
18001 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18002 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18003 DAG.getVTList(MVT::Other),
18004 Ops, MVT::i16, MMO);
18005
18006 // Load FP Control Word from stack slot
18007 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18008 MachinePointerInfo(), false, false, false, 0);
18009
18010 // Transform as necessary
18011 SDValue CWD1 =
18012 DAG.getNode(ISD::SRL, DL, MVT::i16,
18013 DAG.getNode(ISD::AND, DL, MVT::i16,
18014 CWD, DAG.getConstant(0x800, MVT::i16)),
18015 DAG.getConstant(11, MVT::i8));
18016 SDValue CWD2 =
18017 DAG.getNode(ISD::SRL, DL, MVT::i16,
18018 DAG.getNode(ISD::AND, DL, MVT::i16,
18019 CWD, DAG.getConstant(0x400, MVT::i16)),
18020 DAG.getConstant(9, MVT::i8));
18021
18022 SDValue RetVal =
18023 DAG.getNode(ISD::AND, DL, MVT::i16,
18024 DAG.getNode(ISD::ADD, DL, MVT::i16,
18025 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18026 DAG.getConstant(1, MVT::i16)),
18027 DAG.getConstant(3, MVT::i16));
18028
18029 return DAG.getNode((VT.getSizeInBits() < 16 ?
18030 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18031}
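// A scalar sketch of the conversion above (illustrative only): applied to the
// 16-bit word stored by FNSTCW, the expression maps nearest->1, -inf->3,
// +inf->2 and zero->0, exactly the FLT_ROUNDS encoding.
static int fltRoundsModel(unsigned CW) {
  return (int)(((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3);
}
// e.g. fltRoundsModel(0x0000) == 1 (round to nearest) and
//      fltRoundsModel(0x0C00) == 0 (round toward zero).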
18032
18033static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18034 MVT VT = Op.getSimpleValueType();
18035 EVT OpVT = VT;
18036 unsigned NumBits = VT.getSizeInBits();
18037 SDLoc dl(Op);
18038
18039 Op = Op.getOperand(0);
18040 if (VT == MVT::i8) {
18041 // Zero extend to i32 since there is not an i8 bsr.
18042 OpVT = MVT::i32;
18043 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18044 }
18045
18046 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18047 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18048 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18049
18050 // If src is zero (i.e. bsr sets ZF), returns NumBits.
18051 SDValue Ops[] = {
18052 Op,
18053 DAG.getConstant(NumBits+NumBits-1, OpVT),
18054 DAG.getConstant(X86::COND_E, MVT::i8),
18055 Op.getValue(1)
18056 };
18057 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18058
18059 // Finally xor with NumBits-1.
18060 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18061
18062 if (VT == MVT::i8)
18063 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18064 return Op;
18065}
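// A scalar sketch of the same lowering (illustrative only), using a loop in
// place of BSR: for non-zero input ctlz(x) == (NumBits-1) ^ bsr(x), and the
// CMOV above substitutes 2*NumBits-1 for the BSR result when the input is
// zero so that the final XOR yields NumBits (for i32: 63 ^ 31 == 32).
static unsigned bsrModel(unsigned X) { // index of the highest set bit, X != 0
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}
static unsigned ctlzModel(unsigned X) {
  unsigned Idx = (X == 0) ? 63u : bsrModel(X); // the CMOV on ZF
  return Idx ^ 31u;                            // XOR with NumBits-1
}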
18066
18067static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18068 MVT VT = Op.getSimpleValueType();
18069 EVT OpVT = VT;
18070 unsigned NumBits = VT.getSizeInBits();
18071 SDLoc dl(Op);
18072
18073 Op = Op.getOperand(0);
18074 if (VT == MVT::i8) {
18075 // Zero extend to i32 since there is not an i8 bsr.
18076 OpVT = MVT::i32;
18077 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18078 }
18079
18080 // Issue a bsr (scan bits in reverse).
18081 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18082 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18083
18084 // And xor with NumBits-1.
18085 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18086
18087 if (VT == MVT::i8)
18088 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18089 return Op;
18090}
18091
18092static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18093 MVT VT = Op.getSimpleValueType();
18094 unsigned NumBits = VT.getSizeInBits();
18095 SDLoc dl(Op);
18096 Op = Op.getOperand(0);
18097
18098 // Issue a bsf (scan bits forward) which also sets EFLAGS.
18099 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18100 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18101
18102 // If src is zero (i.e. bsf sets ZF), returns NumBits.
18103 SDValue Ops[] = {
18104 Op,
18105 DAG.getConstant(NumBits, VT),
18106 DAG.getConstant(X86::COND_E, MVT::i8),
18107 Op.getValue(1)
18108 };
18109 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18110}
18111
18112// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18113// ones, and then concatenate the result back.
18114static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18115 MVT VT = Op.getSimpleValueType();
18116
18117 assert(VT.is256BitVector() && VT.isInteger() &&
18118 "Unsupported value type for operation");
18119
18120 unsigned NumElems = VT.getVectorNumElements();
18121 SDLoc dl(Op);
18122
18123 // Extract the LHS vectors
18124 SDValue LHS = Op.getOperand(0);
18125 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18126 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18127
18128 // Extract the RHS vectors
18129 SDValue RHS = Op.getOperand(1);
18130 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18131 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18132
18133 MVT EltVT = VT.getVectorElementType();
18134 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18135
18136 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18137 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18138 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18139}
18140
18141static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18142 assert(Op.getSimpleValueType().is256BitVector() &&
18143 Op.getSimpleValueType().isInteger() &&
18144 "Only handle AVX 256-bit vector integer operation");
18145 return Lower256IntArith(Op, DAG);
18146}
18147
18148static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18149 assert(Op.getSimpleValueType().is256BitVector() &&
18150 Op.getSimpleValueType().isInteger() &&
18151 "Only handle AVX 256-bit vector integer operation");
18152 return Lower256IntArith(Op, DAG);
18153}
18154
18155static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18156 SelectionDAG &DAG) {
18157 SDLoc dl(Op);
18158 MVT VT = Op.getSimpleValueType();
18159
18160 // Decompose 256-bit ops into smaller 128-bit ops.
18161 if (VT.is256BitVector() && !Subtarget->hasInt256())
18162 return Lower256IntArith(Op, DAG);
18163
18164 SDValue A = Op.getOperand(0);
18165 SDValue B = Op.getOperand(1);
18166
18167 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18168 if (VT == MVT::v4i32) {
18169 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18170 "Should not custom lower when pmuldq is available!");
18171
18172 // Extract the odd parts.
18173 static const int UnpackMask[] = { 1, -1, 3, -1 };
18174 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18175 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18176
18177 // Multiply the even parts.
18178 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18179 // Now multiply odd parts.
18180 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18181
18182 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18183 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18184
18185 // Merge the two vectors back together with a shuffle. This expands into 2
18186 // shuffles.
18187 static const int ShufMask[] = { 0, 4, 2, 6 };
18188 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18189 }
18190
18191 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18192 "Only know how to lower V2I64/V4I64/V8I64 multiply");
18193
18194 // Ahi = psrlqi(a, 32);
18195 // Bhi = psrlqi(b, 32);
18196 //
18197 // AloBlo = pmuludq(a, b);
18198 // AloBhi = pmuludq(a, Bhi);
18199 // AhiBlo = pmuludq(Ahi, b);
18200
18201 // AloBhi = psllqi(AloBhi, 32);
18202 // AhiBlo = psllqi(AhiBlo, 32);
18203 // return AloBlo + AloBhi + AhiBlo;
18204
18205 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18206 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18207
18208 // Bit cast to 32-bit vectors for MULUDQ
18209 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18210 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18211 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18212 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18213 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18214 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18215
18216 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18217 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18218 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18219
18220 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18221 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18222
18223 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18224 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18225}
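// A scalar sketch of the decomposition above (illustrative only): writing
// a = (Ahi << 32) | Alo and b = (Bhi << 32) | Blo, the low 64 bits of a*b are
// Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), matching the three PMULUDQs, the two
// 32-bit left shifts and the adds emitted per lane.
static unsigned long long mul64Model(unsigned long long A, unsigned long long B) {
  unsigned long long Alo = A & 0xffffffffULL, Ahi = A >> 32;
  unsigned long long Blo = B & 0xffffffffULL, Bhi = B >> 32;
  unsigned long long AloBlo = Alo * Blo; // pmuludq(a, b)
  unsigned long long AloBhi = Alo * Bhi; // pmuludq(a, Bhi)
  unsigned long long AhiBlo = Ahi * Blo; // pmuludq(Ahi, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32); // wraps mod 2^64, like the vector adds
}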
18226
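// Win64 has no native 128-bit divide or remainder, so these nodes become calls
// to the corresponding 128-bit runtime-library routine: each i128 operand is
// spilled to a 16-byte-aligned stack slot and passed by pointer, and the
// result comes back in a register as v2i64 and is bitcast to the original type.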
18227SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18228 assert(Subtarget->isTargetWin64() && "Unexpected target");
18229 EVT VT = Op.getValueType();
18230 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18231 "Unexpected return type for lowering");
18232
18233 RTLIB::Libcall LC;
18234 bool isSigned;
18235 switch (Op->getOpcode()) {
18236 default: llvm_unreachable("Unexpected request for libcall!");
18237 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
18238 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
18239 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
18240 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
18241 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
18242 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18243 }
18244
18245 SDLoc dl(Op);
18246 SDValue InChain = DAG.getEntryNode();
18247
18248 TargetLowering::ArgListTy Args;
18249 TargetLowering::ArgListEntry Entry;
18250 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18251 EVT ArgVT = Op->getOperand(i).getValueType();
18252 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18253 "Unexpected argument type for lowering");
18254 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18255 Entry.Node = StackPtr;
18256 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18257 false, false, 16);
18258 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18259 Entry.Ty = PointerType::get(ArgTy,0);
18260 Entry.isSExt = false;
18261 Entry.isZExt = false;
18262 Args.push_back(Entry);
18263 }
18264
18265 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18266 getPointerTy());
18267
18268 TargetLowering::CallLoweringInfo CLI(DAG);
18269 CLI.setDebugLoc(dl).setChain(InChain)
18270 .setCallee(getLibcallCallingConv(LC),
18271 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18272 Callee, std::move(Args), 0)
18273 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18274
18275 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18276 return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18277}
18278
18279static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18280 SelectionDAG &DAG) {
18281 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18282 EVT VT = Op0.getValueType();
18283 SDLoc dl(Op);
18284
18285 assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18286 (VT == MVT::v8i32 && Subtarget->hasInt256()));
18287
18288 // PMULxD operations multiply each even value (starting at 0) of LHS with
18289 // the related value of RHS and produce a widen result.
18290 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18291 // => <2 x i64> <ae|cg>
18292 //
18293 // In other words, to have all the results, we need to perform two PMULxD:
18294 // 1. one with the even values.
18295 // 2. one with the odd values.
18296 // To achieve #2, we need to place the odd values at even positions.
18297 //
18298 // Place the odd value at an even position (basically, shift all values 1
18299 // step to the left):
18300 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18301 // <a|b|c|d> => <b|undef|d|undef>
18302 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18303 // <e|f|g|h> => <f|undef|h|undef>
18304 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18305
18306 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18307 // ints.
18308 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18309 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18310 unsigned Opcode =
18311 (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18312 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18313 // => <2 x i64> <ae|cg>
18314 SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18315 DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18316 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18317 // => <2 x i64> <bf|dh>
18318 SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18319 DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18320
18321 // Shuffle it back into the right order.
18322 SDValue Highs, Lows;
18323 if (VT == MVT::v8i32) {
18324 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18325 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18326 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18327 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18328 } else {
18329 const int HighMask[] = {1, 5, 3, 7};
18330 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18331 const int LowMask[] = {0, 4, 2, 6};
18332 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18333 }
18334
18335 // If we have a signed multiply but no PMULDQ, fix up the high parts of the
18336 // unsigned multiply we emitted instead.
18337 if (IsSigned && !Subtarget->hasSSE41()) {
18338 SDValue ShAmt =
18339 DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18340 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18341 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18342 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18343 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18344
18345 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18346 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18347 }
18348
18349 // The first result of MUL_LOHI is actually the low value, followed by the
18350 // high value.
18351 SDValue Ops[] = {Lows, Highs};
18352 return DAG.getMergeValues(Ops, dl);
18353}
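// A scalar sketch of the signed fixup above (illustrative only, for the case
// without PMULDQ): if Hu is the high half of the unsigned product, the high
// half of the signed product is Hu - ((A < 0) ? B : 0) - ((B < 0) ? A : 0),
// which is exactly Highs - (((A >>s 31) & B) + ((B >>s 31) & A)) per lane.
static int mulhsModel(int A, int B) {
  unsigned long long UA = (unsigned)A, UB = (unsigned)B;
  unsigned Hu = (unsigned)((UA * UB) >> 32);                     // unsigned high half (PMULUDQ)
  unsigned Fixup = ((A < 0) ? (unsigned)B : 0u) + ((B < 0) ? (unsigned)A : 0u);
  return (int)(Hu - Fixup);                                      // signed high half
}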
18354
18355static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18356 const X86Subtarget *Subtarget) {
18357 MVT VT = Op.getSimpleValueType();
18358 SDLoc dl(Op);
18359 SDValue R = Op.getOperand(0);
18360 SDValue Amt = Op.getOperand(1);
18361
18362 // Optimize shl/srl/sra with constant shift amount.
18363 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18364 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18365 uint64_t ShiftAmt = ShiftConst->getZExtValue();
18366
18367 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18368 (Subtarget->hasInt256() &&
18369 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18370 (Subtarget->hasAVX512() &&
18371 (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18372 if (Op.getOpcode() == ISD::SHL)
18373 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18374 DAG);
18375 if (Op.getOpcode() == ISD::SRL)
18376 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18377 DAG);
18378 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18379 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18380 DAG);
18381 }
18382
18383 if (VT == MVT::v16i8) {
18384 if (Op.getOpcode() == ISD::SHL) {
18385 // Make a large shift.
18386 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18387 MVT::v8i16, R, ShiftAmt,
18388 DAG);
18389 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18390 // Zero out the rightmost bits.
18391 SmallVector<SDValue, 16> V(16,
18392 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18393 MVT::i8));
18394 return DAG.getNode(ISD::AND, dl, VT, SHL,
18395 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18396 }
18397 if (Op.getOpcode() == ISD::SRL) {
18398 // Make a large shift.
18399 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18400 MVT::v8i16, R, ShiftAmt,
18401 DAG);
18402 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18403 // Zero out the leftmost bits.
18404 SmallVector<SDValue, 16> V(16,
18405 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18406 MVT::i8));
18407 return DAG.getNode(ISD::AND, dl, VT, SRL,
18408 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18409 }
18410 if (Op.getOpcode() == ISD::SRA) {
18411 if (ShiftAmt == 7) {
18412 // R s>> 7 === R s< 0
18413 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18414 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18415 }
18416
18417 // R s>> a === ((R u>> a) ^ m) - m
18418 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18419 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18420 MVT::i8));
18421 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18422 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18423 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18424 return Res;
18425 }
18426 llvm_unreachable("Unknown shift opcode.")::llvm::llvm_unreachable_internal("Unknown shift opcode.", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18426)
;
18427 }
18428
18429 if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18430 if (Op.getOpcode() == ISD::SHL) {
18431 // Make a large shift.
18432 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18433 MVT::v16i16, R, ShiftAmt,
18434 DAG);
18435 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18436 // Zero out the rightmost bits.
18437 SmallVector<SDValue, 32> V(32,
18438 DAG.getConstant(uint8_t(-1U << ShiftAmt),
18439 MVT::i8));
18440 return DAG.getNode(ISD::AND, dl, VT, SHL,
18441 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18442 }
18443 if (Op.getOpcode() == ISD::SRL) {
18444 // Make a large shift.
18445 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18446 MVT::v16i16, R, ShiftAmt,
18447 DAG);
18448 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18449 // Zero out the leftmost bits.
18450 SmallVector<SDValue, 32> V(32,
18451 DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18452 MVT::i8));
18453 return DAG.getNode(ISD::AND, dl, VT, SRL,
18454 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18455 }
18456 if (Op.getOpcode() == ISD::SRA) {
18457 if (ShiftAmt == 7) {
18458 // R s>> 7 === R s< 0
18459 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18460 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18461 }
18462
18463 // R s>> a === ((R u>> a) ^ m) - m
18464 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18465 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18466 MVT::i8));
18467 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18468 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18469 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18470 return Res;
18471 }
18472 llvm_unreachable("Unknown shift opcode.")::llvm::llvm_unreachable_internal("Unknown shift opcode.", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18472)
;
18473 }
18474 }
18475 }
18476
18477 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18478 if (!Subtarget->is64Bit() &&
18479 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18480 Amt.getOpcode() == ISD::BITCAST &&
18481 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
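// Each i64 shift amount is visible here as Ratio consecutive narrower
// BUILD_VECTOR constants under the bitcast.  The first loop ORs piece i into
// bit position i*(64/Ratio) to rebuild the 64-bit amount of lane 0, and the
// second loop requires every other i64 lane to rebuild to the same constant
// before a single VSHLI/VSRLI/VSRAI is emitted.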
18482 Amt = Amt.getOperand(0);
18483 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18484 VT.getVectorNumElements();
18485 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18486 uint64_t ShiftAmt = 0;
18487 for (unsigned i = 0; i != Ratio; ++i) {
18488 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18489 if (!C)
18490 return SDValue();
18491 // 6 == Log2(64)
18492 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18493 }
18494 // Check remaining shift amounts.
18495 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18496 uint64_t ShAmt = 0;
18497 for (unsigned j = 0; j != Ratio; ++j) {
18498 ConstantSDNode *C =
18499 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18500 if (!C)
18501 return SDValue();
18502 // 6 == Log2(64)
18503 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18504 }
18505 if (ShAmt != ShiftAmt)
18506 return SDValue();
18507 }
18508 switch (Op.getOpcode()) {
18509 default:
18510 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18510)
;
18511 case ISD::SHL:
18512 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18513 DAG);
18514 case ISD::SRL:
18515 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18516 DAG);
18517 case ISD::SRA:
18518 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18519 DAG);
18520 }
18521 }
18522
18523 return SDValue();
18524}
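// A scalar sketch of the byte arithmetic-shift identity used above
// (illustrative only): x86 has no byte-granularity arithmetic shift, so with
// m = 0x80 >> a the lowering computes ((x u>> a) ^ m) - m, which re-extends
// the shifted-down sign bit.
static int sra8Model(unsigned char X, unsigned A) {
  int U = X >> A;      // logical byte shift (what the vector code synthesizes)
  int M = 0x80 >> A;   // mask with the shifted-down sign-bit position set
  return (U ^ M) - M;  // equals (signed char)X >> A for A in [0,7]
}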
18525
18526static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18527 const X86Subtarget* Subtarget) {
18528 MVT VT = Op.getSimpleValueType();
18529 SDLoc dl(Op);
18530 SDValue R = Op.getOperand(0);
18531 SDValue Amt = Op.getOperand(1);
18532
18533 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18534 VT == MVT::v4i32 || VT == MVT::v8i16 ||
18535 (Subtarget->hasInt256() &&
18536 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18537 VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18538 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18539 SDValue BaseShAmt;
18540 EVT EltVT = VT.getVectorElementType();
18541
18542 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18543 // Check if this build_vector node is doing a splat.
18544 // If so, then set BaseShAmt equal to the splat value.
18545 BaseShAmt = BV->getSplatValue();
18546 if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18547 BaseShAmt = SDValue();
18548 } else {
18549 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18550 Amt = Amt.getOperand(0);
18551
18552 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18553 if (SVN && SVN->isSplat()) {
18554 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18555 SDValue InVec = Amt.getOperand(0);
18556 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18557 assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18558 "Unexpected shuffle index found!");
18559 BaseShAmt = InVec.getOperand(SplatIdx);
18560 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18561 if (ConstantSDNode *C =
18562 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18563 if (C->getZExtValue() == SplatIdx)
18564 BaseShAmt = InVec.getOperand(1);
18565 }
18566 }
18567
18568 if (!BaseShAmt)
18569 // Avoid introducing an extract element from a shuffle.
18570 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18571 DAG.getIntPtrConstant(SplatIdx));
18572 }
18573 }
18574
18575 if (BaseShAmt.getNode()) {
18576 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18577 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18578 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18579 else if (EltVT.bitsLT(MVT::i32))
18580 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18581
18582 switch (Op.getOpcode()) {
18583 default:
18584 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18584)
;
18585 case ISD::SHL:
18586 switch (VT.SimpleTy) {
18587 default: return SDValue();
18588 case MVT::v2i64:
18589 case MVT::v4i32:
18590 case MVT::v8i16:
18591 case MVT::v4i64:
18592 case MVT::v8i32:
18593 case MVT::v16i16:
18594 case MVT::v16i32:
18595 case MVT::v8i64:
18596 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18597 }
18598 case ISD::SRA:
18599 switch (VT.SimpleTy) {
18600 default: return SDValue();
18601 case MVT::v4i32:
18602 case MVT::v8i16:
18603 case MVT::v8i32:
18604 case MVT::v16i16:
18605 case MVT::v16i32:
18606 case MVT::v8i64:
18607 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18608 }
18609 case ISD::SRL:
18610 switch (VT.SimpleTy) {
18611 default: return SDValue();
18612 case MVT::v2i64:
18613 case MVT::v4i32:
18614 case MVT::v8i16:
18615 case MVT::v4i64:
18616 case MVT::v8i32:
18617 case MVT::v16i16:
18618 case MVT::v16i32:
18619 case MVT::v8i64:
18620 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18621 }
18622 }
18623 }
18624 }
18625
18626 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18627 if (!Subtarget->is64Bit() &&
18628 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18629 (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18630 Amt.getOpcode() == ISD::BITCAST &&
18631 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18632 Amt = Amt.getOperand(0);
18633 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18634 VT.getVectorNumElements();
18635 std::vector<SDValue> Vals(Ratio);
18636 for (unsigned i = 0; i != Ratio; ++i)
18637 Vals[i] = Amt.getOperand(i);
18638 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18639 for (unsigned j = 0; j != Ratio; ++j)
18640 if (Vals[j] != Amt.getOperand(i + j))
18641 return SDValue();
18642 }
18643 switch (Op.getOpcode()) {
18644 default:
18645 llvm_unreachable("Unknown shift opcode!")::llvm::llvm_unreachable_internal("Unknown shift opcode!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 18645)
;
18646 case ISD::SHL:
18647 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18648 case ISD::SRL:
18649 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18650 case ISD::SRA:
18651 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18652 }
18653 }
18654
18655 return SDValue();
18656}
18657
18658static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18659 SelectionDAG &DAG) {
18660 MVT VT = Op.getSimpleValueType();
18661 SDLoc dl(Op);
18662 SDValue R = Op.getOperand(0);
18663 SDValue Amt = Op.getOperand(1);
18664 SDValue V;
18665
18666 assert(VT.isVector() && "Custom lowering only for vector shifts!");
18667 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18668
18669 V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18670 if (V.getNode())
18671 return V;
18672
18673 V = LowerScalarVariableShift(Op, DAG, Subtarget);
18674 if (V.getNode())
18675 return V;
18676
18677 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18678 return Op;
18679 // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18680 if (Subtarget->hasInt256()) {
18681 if (Op.getOpcode() == ISD::SRL &&
18682 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18683 VT == MVT::v4i64 || VT == MVT::v8i32))
18684 return Op;
18685 if (Op.getOpcode() == ISD::SHL &&
18686 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18687 VT == MVT::v4i64 || VT == MVT::v8i32))
18688 return Op;
18689 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18690 return Op;
18691 }
18692
18693 // If possible, lower this packed shift into a vector multiply instead of
18694 // expanding it into a sequence of scalar shifts.
18695 // Do this only if the vector shift count is a constant build_vector.
18696 if (Op.getOpcode() == ISD::SHL &&
18697 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18698 (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18699 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18700 SmallVector<SDValue, 8> Elts;
18701 EVT SVT = VT.getScalarType();
18702 unsigned SVTBits = SVT.getSizeInBits();
18703 const APInt &One = APInt(SVTBits, 1);
18704 unsigned NumElems = VT.getVectorNumElements();
18705
18706 for (unsigned i=0; i !=NumElems; ++i) {
18707 SDValue Op = Amt->getOperand(i);
18708 if (Op->getOpcode() == ISD::UNDEF) {
18709 Elts.push_back(Op);
18710 continue;
18711 }
18712
18713 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18714 const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18715 uint64_t ShAmt = C.getZExtValue();
18716 if (ShAmt >= SVTBits) {
18717 Elts.push_back(DAG.getUNDEF(SVT));
18718 continue;
18719 }
18720 Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18721 }
18722 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18723 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18724 }
18725
18726 // Lower SHL with variable shift amount.
18727 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
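// The two constants build 2^Amt through the IEEE-754 single-precision format:
// shifting Amt into the exponent field (bit 23) and adding 0x3f800000 (the
// bit pattern of 1.0f) yields the float 2.0^Amt per lane, FP_TO_SINT turns
// that back into an integer power of two, and the variable shift becomes a
// single vector multiply.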
18728 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18729
18730 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18731 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18732 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18733 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18734 }
18735
18736 // If possible, lower this shift as a sequence of two shifts by
18737 // constant plus a MOVSS/MOVSD instead of scalarizing it.
18738 // Example:
18739 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18740 //
18741 // Could be rewritten as:
18742 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18743 //
18744 // The advantage is that the two shifts from the example would be
18745 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18746 // the vector shift into four scalar shifts plus four pairs of vector
18747 // insert/extract.
18748 if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18749 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18750 unsigned TargetOpcode = X86ISD::MOVSS;
18751 bool CanBeSimplified;
18752 // The splat value for the first packed shift (the 'X' from the example).
18753 SDValue Amt1 = Amt->getOperand(0);
18754 // The splat value for the second packed shift (the 'Y' from the example).
18755 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18756 Amt->getOperand(2);
18757
18758 // See if it is possible to replace this node with a sequence of
18759 // two shifts followed by a MOVSS/MOVSD
18760 if (VT == MVT::v4i32) {
18761 // Check if it is legal to use a MOVSS.
18762 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18763 Amt2 == Amt->getOperand(3);
18764 if (!CanBeSimplified) {
18765 // Otherwise, check if we can still simplify this node using a MOVSD.
18766 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18767 Amt->getOperand(2) == Amt->getOperand(3);
18768 TargetOpcode = X86ISD::MOVSD;
18769 Amt2 = Amt->getOperand(2);
18770 }
18771 } else {
18772 // Do similar checks for the case where the machine value type
18773 // is MVT::v8i16.
18774 CanBeSimplified = Amt1 == Amt->getOperand(1);
18775 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18776 CanBeSimplified = Amt2 == Amt->getOperand(i);
18777
18778 if (!CanBeSimplified) {
18779 TargetOpcode = X86ISD::MOVSD;
18780 CanBeSimplified = true;
18781 Amt2 = Amt->getOperand(4);
18782 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18783 CanBeSimplified = Amt1 == Amt->getOperand(i);
18784 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18785 CanBeSimplified = Amt2 == Amt->getOperand(j);
18786 }
18787 }
18788
18789 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18790 isa<ConstantSDNode>(Amt2)) {
18791 // Replace this node with two shifts followed by a MOVSS/MOVSD.
18792 EVT CastVT = MVT::v4i32;
18793 SDValue Splat1 =
18794 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18795 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18796 SDValue Splat2 =
18797 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18798 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18799 if (TargetOpcode == X86ISD::MOVSD)
18800 CastVT = MVT::v2i64;
18801 SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18802 SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18803 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18804 BitCast1, DAG);
18805 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18806 }
18807 }
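// A minimal standalone sketch (not from this file) modelling the rewrite
// described in the comment above for the v4i32 case: (srl A, <X,Y,Y,Y>)
// becomes two splat shifts blended with a MOVSS-style select that takes only
// element 0 from the X-shift. The helper name is hypothetical.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> srl_xyyy(const std::array<uint32_t, 4> &A,
                                        uint32_t X, uint32_t Y) {
  std::array<uint32_t, 4> ShiftX, ShiftY;
  for (int i = 0; i != 4; ++i) {
    ShiftX[i] = A[i] >> X;              // (srl A, <X,X,X,X>) -> X86ISD::VSRLI
    ShiftY[i] = A[i] >> Y;              // (srl A, <Y,Y,Y,Y>) -> X86ISD::VSRLI
  }
  std::array<uint32_t, 4> R = ShiftY;   // start from the Y-shift
  R[0] = ShiftX[0];                     // MOVSS keeps element 0 of the X-shift
  return R;
}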
18808
18809 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18810 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
18811
18812 // a = a << 5;
18813 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18814 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18815
18816 // Turn 'a' into a mask suitable for VSELECT
18817 SDValue VSelM = DAG.getConstant(0x80, VT);
18818 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18819 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18820
18821 SDValue CM1 = DAG.getConstant(0x0f, VT);
18822 SDValue CM2 = DAG.getConstant(0x3f, VT);
18823
18824 // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18825 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18826 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18827 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18828 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18829
18830 // a += a
18831 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18832 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18833 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18834
18835 // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18836 M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18837 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18838 M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18839 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18840
18841 // a += a
18842 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18843 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18844 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18845
18846 // return VSELECT(r, r+r, a);
18847 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18848 DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18849 return R;
18850 }
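// A minimal standalone sketch (not from this file) modelling one byte lane of
// the v16i8 SHL sequence above: the shift amount is moved into the top bits
// (a << 5) and consumed one bit per round, conditionally shifting by 4, 2 and
// finally 1 (as r + r). The CM1/CM2 masks in the DAG code exist only because
// psllw shifts 16-bit lanes and must not leak bits into the neighboring byte;
// a true byte shift needs no mask.
#include <cstdint>

static uint8_t shl_byte_model(uint8_t r, uint8_t amt) {
  uint8_t a = static_cast<uint8_t>(amt << 5);      // bit 7 = amt bit 2, ...
  if (a & 0x80) r = static_cast<uint8_t>(r << 4);  // round 1: shift by 4
  a = static_cast<uint8_t>(a + a);                 // expose next amount bit
  if (a & 0x80) r = static_cast<uint8_t>(r << 2);  // round 2: shift by 2
  a = static_cast<uint8_t>(a + a);
  if (a & 0x80) r = static_cast<uint8_t>(r + r);   // round 3: shift by 1
  return r;
}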
18851
18852 // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18853 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18854 // solution better.
18855 if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18856 MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18857 unsigned ExtOpc =
18858 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18859 R = DAG.getNode(ExtOpc, dl, NewVT, R);
18860 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18861 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18862 DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18863 }
18864
18865 // Decompose 256-bit shifts into smaller 128-bit shifts.
18866 if (VT.is256BitVector()) {
18867 unsigned NumElems = VT.getVectorNumElements();
18868 MVT EltVT = VT.getVectorElementType();
18869 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18870
18871 // Extract the two vectors
18872 SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18873 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18874
18875 // Recreate the shift amount vectors
18876 SDValue Amt1, Amt2;
18877 if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18878 // Constant shift amount
18879 SmallVector<SDValue, 4> Amt1Csts;
18880 SmallVector<SDValue, 4> Amt2Csts;
18881 for (unsigned i = 0; i != NumElems/2; ++i)
18882 Amt1Csts.push_back(Amt->getOperand(i));
18883 for (unsigned i = NumElems/2; i != NumElems; ++i)
18884 Amt2Csts.push_back(Amt->getOperand(i));
18885
18886 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18887 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18888 } else {
18889 // Variable shift amount
18890 Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18891 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18892 }
18893
18894 // Issue new vector shifts for the smaller types
18895 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18896 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18897
18898 // Concatenate the result back
18899 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18900 }
18901
18902 return SDValue();
18903}
18904
18905static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
18906 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
18907 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
18908 // looks for this combo and may remove the "setcc" instruction if the "setcc"
18909 // has only one use.
18910 SDNode *N = Op.getNode();
18911 SDValue LHS = N->getOperand(0);
18912 SDValue RHS = N->getOperand(1);
18913 unsigned BaseOp = 0;
18914 unsigned Cond = 0;
18915 SDLoc DL(Op);
18916 switch (Op.getOpcode()) {
18917 default: llvm_unreachable("Unknown ovf instruction!");
18918 case ISD::SADDO:
18919 // An add of one will be selected as an INC. Note that INC doesn't
18920 // set CF, so we can't do this for UADDO.
18921 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18922 if (C->isOne()) {
18923 BaseOp = X86ISD::INC;
18924 Cond = X86::COND_O;
18925 break;
18926 }
18927 BaseOp = X86ISD::ADD;
18928 Cond = X86::COND_O;
18929 break;
18930 case ISD::UADDO:
18931 BaseOp = X86ISD::ADD;
18932 Cond = X86::COND_B;
18933 break;
18934 case ISD::SSUBO:
18935 // A subtract of one will be selected as a DEC. Note that DEC doesn't
18936 // set CF, so we can't do this for USUBO.
18937 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18938 if (C->isOne()) {
18939 BaseOp = X86ISD::DEC;
18940 Cond = X86::COND_O;
18941 break;
18942 }
18943 BaseOp = X86ISD::SUB;
18944 Cond = X86::COND_O;
18945 break;
18946 case ISD::USUBO:
18947 BaseOp = X86ISD::SUB;
18948 Cond = X86::COND_B;
18949 break;
18950 case ISD::SMULO:
18951 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
18952 Cond = X86::COND_O;
18953 break;
18954 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
18955 if (N->getValueType(0) == MVT::i8) {
18956 BaseOp = X86ISD::UMUL8;
18957 Cond = X86::COND_O;
18958 break;
18959 }
18960 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
18961 MVT::i32);
18962 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
18963
18964 SDValue SetCC =
18965 DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
18966 DAG.getConstant(X86::COND_O, MVT::i32),
18967 SDValue(Sum.getNode(), 2));
18968
18969 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18970 }
18971 }
18972
18973 // Also sets EFLAGS.
18974 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
18975 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
18976
18977 SDValue SetCC =
18978 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
18979 DAG.getConstant(Cond, MVT::i32),
18980 SDValue(Sum.getNode(), 1));
18981
18982 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
18983}
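// A hedged source-level illustration (not from this file) of what reaches
// LowerXALUO: the GCC/Clang overflow builtins shown here (their availability
// is an assumption about the toolchain) become ISD::SADDO / ISD::UADDO, which
// the code above lowers to a single ADD whose flags feed SETO (signed
// overflow) or SETB (carry).
#include <cstdint>

static bool sadd_overflows(int32_t a, int32_t b, int32_t &sum) {
  return __builtin_sadd_overflow(a, b, &sum);   // typically: add; seto
}
static bool uadd_overflows(uint32_t a, uint32_t b, uint32_t &sum) {
  return __builtin_uadd_overflow(a, b, &sum);   // typically: add; setb
}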
18984
18985// Sign extension of the low part of vector elements. This may be used either
18986// when sign extend instructions are not available or if the vector element
18987// sizes already match the sign-extended size. If the vector elements are in
18988// their pre-extended size and sign extend instructions are available, that will
18989// be handled by LowerSIGN_EXTEND.
18990SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
18991 SelectionDAG &DAG) const {
18992 SDLoc dl(Op);
18993 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
18994 MVT VT = Op.getSimpleValueType();
18995
18996 if (!Subtarget->hasSSE2() || !VT.isVector())
18997 return SDValue();
18998
18999 unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19000 ExtraVT.getScalarType().getSizeInBits();
19001
19002 switch (VT.SimpleTy) {
19003 default: return SDValue();
19004 case MVT::v8i32:
19005 case MVT::v16i16:
19006 if (!Subtarget->hasFp256())
19007 return SDValue();
19008 if (!Subtarget->hasInt256()) {
19009 // needs to be split
19010 unsigned NumElems = VT.getVectorNumElements();
19011
19012 // Extract the LHS vectors
19013 SDValue LHS = Op.getOperand(0);
19014 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19015 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19016
19017 MVT EltVT = VT.getVectorElementType();
19018 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19019
19020 EVT ExtraEltVT = ExtraVT.getVectorElementType();
19021 unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19022 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19023 ExtraNumElems/2);
19024 SDValue Extra = DAG.getValueType(ExtraVT);
19025
19026 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19027 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19028
19029 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19030 }
19031 // fall through
19032 case MVT::v4i32:
19033 case MVT::v8i16: {
19034 SDValue Op0 = Op.getOperand(0);
19035
19036 // This is a sign extension of some low part of vector elements without
19037 // changing the size of the vector elements themselves:
19038 // Shift-Left + Shift-Right-Algebraic.
19039 SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19040 BitsDiff, DAG);
19041 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19042 DAG);
19043 }
19044 }
19045}
19046
19047/// Returns true if the operand type is exactly twice the native width, and
19048/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19049/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19050/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19051bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19052 const X86Subtarget &Subtarget =
19053 getTargetMachine().getSubtarget<X86Subtarget>();
19054 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19055
19056 if (OpWidth == 64)
19057 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19058 else if (OpWidth == 128)
19059 return Subtarget.hasCmpxchg16b();
19060 else
19061 return false;
19062}
19063
19064bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19065 return needsCmpXchgNb(SI->getValueOperand()->getType());
19066}
19067
19068// Note: this turns large loads into lock cmpxchg8b/16b.
19069 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
19070bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19071 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19072 return needsCmpXchgNb(PTy->getElementType());
19073}
19074
19075bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19076 const X86Subtarget &Subtarget =
19077 getTargetMachine().getSubtarget<X86Subtarget>();
19078 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19079 const Type *MemType = AI->getType();
19080
19081 // If the operand is too big, we must see if cmpxchg8/16b is available
19082 // and default to library calls otherwise.
19083 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19084 return needsCmpXchgNb(MemType);
19085
19086 AtomicRMWInst::BinOp Op = AI->getOperation();
19087 switch (Op) {
19088 default:
19089 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19089)
;
19090 case AtomicRMWInst::Xchg:
19091 case AtomicRMWInst::Add:
19092 case AtomicRMWInst::Sub:
19093 // It's better to use xadd, xsub or xchg for these in all cases.
19094 return false;
19095 case AtomicRMWInst::Or:
19096 case AtomicRMWInst::And:
19097 case AtomicRMWInst::Xor:
19098 // If the atomicrmw's result isn't actually used, we can just add a "lock"
19099 // prefix to a normal instruction for these operations.
19100 return !AI->use_empty();
19101 case AtomicRMWInst::Nand:
19102 case AtomicRMWInst::Max:
19103 case AtomicRMWInst::Min:
19104 case AtomicRMWInst::UMax:
19105 case AtomicRMWInst::UMin:
19106 // These always require a non-trivial set of data operations on x86. We must
19107 // use a cmpxchg loop.
19108 return true;
19109 }
19110}
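// A hedged source-level illustration (not from this file) of the Or/And/Xor
// case above: when the result of the RMW is unused, shouldExpandAtomicRMWInIR
// returns false and the operation can typically be selected as one
// lock-prefixed instruction; when the old value is needed, a cmpxchg loop is
// generated instead.
#include <atomic>
#include <cstdint>

static void set_flag(std::atomic<uint32_t> &flags) {
  flags.fetch_or(1u);                // result unused -> typically "lock or"
}
static uint32_t set_flag_and_read(std::atomic<uint32_t> &flags) {
  return flags.fetch_or(1u);         // result used -> cmpxchg loop
}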
19111
19112static bool hasMFENCE(const X86Subtarget& Subtarget) {
19113 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19114 // no-sse2). There isn't any reason to disable it if the target processor
19115 // supports it.
19116 return Subtarget.hasSSE2() || Subtarget.is64Bit();
19117}
19118
19119LoadInst *
19120X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19121 const X86Subtarget &Subtarget =
19122 getTargetMachine().getSubtarget<X86Subtarget>();
19123 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19124 const Type *MemType = AI->getType();
19125 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19126 // there is no benefit in turning such RMWs into loads, and it is actually
19127 // harmful as it introduces an mfence.
19128 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19129 return nullptr;
19130
19131 auto Builder = IRBuilder<>(AI);
19132 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19133 auto SynchScope = AI->getSynchScope();
19134 // We must restrict the ordering to avoid generating loads with Release or
19135 // ReleaseAcquire orderings.
19136 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19137 auto Ptr = AI->getPointerOperand();
19138
19139 // Before the load we need a fence. Here is an example lifted from
19140 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19141 // is required:
19142 // Thread 0:
19143 // x.store(1, relaxed);
19144 // r1 = y.fetch_add(0, release);
19145 // Thread 1:
19146 // y.fetch_add(42, acquire);
19147 // r2 = x.load(relaxed);
19148 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19149 // lowered to just a load without a fence. An mfence flushes the store buffer,
19150 // making the optimization clearly correct.
19151 // FIXME: the fence is required if isAtLeastRelease(Order), but it is not
19152 // clear whether it is needed otherwise; we might be able to be more
19153 // aggressive on relaxed idempotent RMWs. In practice, they do not look
19154 // useful, so we don't try to be especially clever.
19155 if (SynchScope == SingleThread) {
19156 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19157 // the IR level, so we must wrap it in an intrinsic.
19158 return nullptr;
19159 } else if (hasMFENCE(Subtarget)) {
19160 Function *MFence = llvm::Intrinsic::getDeclaration(M,
19161 Intrinsic::x86_sse2_mfence);
19162 Builder.CreateCall(MFence);
19163 } else {
19164 // FIXME: it might make sense to use a locked operation here but on a
19165 // different cache-line to prevent cache-line bouncing. In practice it
19166 // is probably a small win, and x86 processors without mfence are rare
19167 // enough that we do not bother.
19168 return nullptr;
19169 }
19170
19171 // Finally we can emit the atomic load.
19172 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19173 AI->getType()->getPrimitiveSizeInBits());
19174 Loaded->setAtomic(Order, SynchScope);
19175 AI->replaceAllUsesWith(Loaded);
19176 AI->eraseFromParent();
19177 return Loaded;
19178}
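// A hedged illustration (not from this file) of the pattern
// lowerIdempotentRMWIntoFencedLoad targets: an RMW that does not change the
// value (here a fetch_add of zero) used only for its read. Subject to the
// conditions checked above, it is rewritten into an mfence followed by a
// plain atomic load of the same location.
#include <atomic>
#include <cstdint>

static uint64_t read_counter(std::atomic<uint64_t> &c) {
  return c.fetch_add(0, std::memory_order_acquire); // ~ mfence + mov on x86-64
}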
19179
19180static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19181 SelectionDAG &DAG) {
19182 SDLoc dl(Op);
19183 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19184 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19185 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19186 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19187
19188 // The only fence that needs an instruction is a sequentially-consistent
19189 // cross-thread fence.
19190 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19191 if (hasMFENCE(*Subtarget))
19192 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19193
19194 SDValue Chain = Op.getOperand(0);
19195 SDValue Zero = DAG.getConstant(0, MVT::i32);
19196 SDValue Ops[] = {
19197 DAG.getRegister(X86::ESP, MVT::i32), // Base
19198 DAG.getTargetConstant(1, MVT::i8), // Scale
19199 DAG.getRegister(0, MVT::i32), // Index
19200 DAG.getTargetConstant(0, MVT::i32), // Disp
19201 DAG.getRegister(0, MVT::i32), // Segment.
19202 Zero,
19203 Chain
19204 };
19205 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19206 return SDValue(Res, 0);
19207 }
19208
19209 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19210 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19211}
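// A hedged illustration (not from this file) of the only fence that
// LowerATOMIC_FENCE turns into a real instruction: a sequentially-consistent
// cross-thread fence, emitted as MFENCE when available and otherwise as the
// locked no-op store built above ("lock or dword ptr [esp], 0"). Weaker
// fences become the MEMBARRIER compiler-barrier no-op.
#include <atomic>

static void full_barrier() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}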
19212
19213static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19214 SelectionDAG &DAG) {
19215 MVT T = Op.getSimpleValueType();
19216 SDLoc DL(Op);
19217 unsigned Reg = 0;
19218 unsigned size = 0;
19219 switch(T.SimpleTy) {
19220 default: llvm_unreachable("Invalid value type!")::llvm::llvm_unreachable_internal("Invalid value type!", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19220)
;
19221 case MVT::i8: Reg = X86::AL; size = 1; break;
19222 case MVT::i16: Reg = X86::AX; size = 2; break;
19223 case MVT::i32: Reg = X86::EAX; size = 4; break;
19224 case MVT::i64:
19225 assert(Subtarget->is64Bit() && "Node not type legal!");
19226 Reg = X86::RAX; size = 8;
19227 break;
19228 }
19229 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19230 Op.getOperand(2), SDValue());
19231 SDValue Ops[] = { cpIn.getValue(0),
19232 Op.getOperand(1),
19233 Op.getOperand(3),
19234 DAG.getTargetConstant(size, MVT::i8),
19235 cpIn.getValue(1) };
19236 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19237 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19238 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19239 Ops, T, MMO);
19240
19241 SDValue cpOut =
19242 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19243 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19244 MVT::i32, cpOut.getValue(2));
19245 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19246 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19247
19248 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19249 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19250 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19251 return SDValue();
19252}
19253
19254static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19255 SelectionDAG &DAG) {
19256 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19257 MVT DstVT = Op.getSimpleValueType();
19258
19259 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19260 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19261 if (DstVT != MVT::f64)
19262 // This conversion needs to be expanded.
19263 return SDValue();
19264
19265 SDValue InVec = Op->getOperand(0);
19266 SDLoc dl(Op);
19267 unsigned NumElts = SrcVT.getVectorNumElements();
19268 EVT SVT = SrcVT.getVectorElementType();
19269
19270 // Widen the input vector in the case of MVT::v2i32.
19271 // Example: from MVT::v2i32 to MVT::v4i32.
19272 SmallVector<SDValue, 16> Elts;
19273 for (unsigned i = 0, e = NumElts; i != e; ++i)
19274 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19275 DAG.getIntPtrConstant(i)));
19276
19277 // Explicitly mark the extra elements as Undef.
19278 SDValue Undef = DAG.getUNDEF(SVT);
19279 for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19280 Elts.push_back(Undef);
19281
19282 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19283 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19284 SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19285 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19286 DAG.getIntPtrConstant(0));
19287 }
19288
19289 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19290 Subtarget->hasMMX() && "Unexpected custom BITCAST");
19291 assert((DstVT == MVT::i64 ||
19292 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19293 "Unexpected custom BITCAST");
19294 // i64 <=> MMX conversions are Legal.
19295 if (SrcVT==MVT::i64 && DstVT.isVector())
19296 return Op;
19297 if (DstVT==MVT::i64 && SrcVT.isVector())
19298 return Op;
19299 // MMX <=> MMX conversions are Legal.
19300 if (SrcVT.isVector() && DstVT.isVector())
19301 return Op;
19302 // All other conversions need to be expanded.
19303 return SDValue();
19304}
19305
19306static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19307 SelectionDAG &DAG) {
19308 SDNode *Node = Op.getNode();
19309 SDLoc dl(Node);
19310
19311 Op = Op.getOperand(0);
19312 EVT VT = Op.getValueType();
19313 assert((VT.is128BitVector() || VT.is256BitVector()) &&
19314 "CTPOP lowering only implemented for 128/256-bit wide vector types");
19315
19316 unsigned NumElts = VT.getVectorNumElements();
19317 EVT EltVT = VT.getVectorElementType();
19318 unsigned Len = EltVT.getSizeInBits();
19319
19320 // This is the vectorized version of the "best" algorithm from
19321 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19322 // with a minor tweak to use a series of adds + shifts instead of vector
19323 // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19324 //
19325 // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19326 // v8i32 => Always profitable
19327 //
19328 // FIXME: There are a couple of possible improvements:
19329 //
19330 // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19331 // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19332 //
19333 assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19334 "CTPOP not implemented for this vector element type.");
19335
19336 // X86 canonicalizes ANDs to vXi64, so generate the appropriate bitcasts to
19337 // avoid extra legalization.
19338 bool NeedsBitcast = EltVT == MVT::i32;
19339 MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19340
19341 SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19342 SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19343 SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19344
19345 // v = v - ((v >> 1) & 0x55555555...)
19346 SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19347 SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19348 SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19349 if (NeedsBitcast)
19350 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19351
19352 SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19353 SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19354 if (NeedsBitcast)
19355 M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19356
19357 SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19358 if (VT != And.getValueType())
19359 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19360 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19361
19362 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19363 SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19364 SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19365 SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19366 SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19367
19368 Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19369 if (NeedsBitcast) {
19370 Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19371 M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19372 Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19373 }
19374
19375 SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19376 SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19377 if (VT != AndRHS.getValueType()) {
19378 AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19379 AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19380 }
19381 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19382
19383 // v = (v + (v >> 4)) & 0x0F0F0F0F...
19384 SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19385 SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19386 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19387 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19388
19389 SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19390 SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19391 if (NeedsBitcast) {
19392 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19393 M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19394 }
19395 And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19396 if (VT != And.getValueType())
19397 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19398
19399 // The algorithm mentioned above uses:
19400 // v = (v * 0x01010101...) >> (Len - 8)
19401 //
19402 // Change it to use vector adds + vector shifts which yield faster results on
19403 // Haswell than using vector integer multiplication.
19404 //
19405 // For i32 elements:
19406 // v = v + (v >> 8)
19407 // v = v + (v >> 16)
19408 //
19409 // For i64 elements:
19410 // v = v + (v >> 8)
19411 // v = v + (v >> 16)
19412 // v = v + (v >> 32)
19413 //
19414 Add = And;
19415 SmallVector<SDValue, 8> Csts;
19416 for (unsigned i = 8; i <= Len/2; i *= 2) {
19417 Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19418 SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19419 Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19420 Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19421 Csts.clear();
19422 }
19423
19424 // The result is in the least significant 6 bits for i32 and 7 bits for i64.
19425 SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19426 SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19427 SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19428 if (NeedsBitcast) {
19429 Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19430 M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19431 }
19432 And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19433 if (VT != And.getValueType())
19434 And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19435
19436 return And;
19437}
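// A minimal standalone sketch (not from this file) modelling one i32 lane of
// LowerCTPOP: the classic parallel bit count from the bithacks page, with the
// final multiply replaced by the add+shift chain the comment above describes
// and the 0x3F mask keeping only the low 6 bits.
#include <cstdint>

static uint32_t popcount32_model(uint32_t v) {
  v = v - ((v >> 1) & 0x55555555u);                 // 2-bit sums
  v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit sums
  v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // byte sums
  v = v + (v >> 8);                                 // instead of * 0x01010101
  v = v + (v >> 16);
  return v & 0x3Fu;                                 // result in the low 6 bits
}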
19438
19439static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19440 SDNode *Node = Op.getNode();
19441 SDLoc dl(Node);
19442 EVT T = Node->getValueType(0);
19443 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19444 DAG.getConstant(0, T), Node->getOperand(2));
19445 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19446 cast<AtomicSDNode>(Node)->getMemoryVT(),
19447 Node->getOperand(0),
19448 Node->getOperand(1), negOp,
19449 cast<AtomicSDNode>(Node)->getMemOperand(),
19450 cast<AtomicSDNode>(Node)->getOrdering(),
19451 cast<AtomicSDNode>(Node)->getSynchScope());
19452}
19453
19454static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19455 SDNode *Node = Op.getNode();
19456 SDLoc dl(Node);
19457 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19458
19459 // Convert seq_cst store -> xchg
19460 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19461 // FIXME: On 32-bit, store -> fist or movq would be more efficient
19462 // (The only way to get a 16-byte store is cmpxchg16b)
19463 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19464 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19465 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19466 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19467 cast<AtomicSDNode>(Node)->getMemoryVT(),
19468 Node->getOperand(0),
19469 Node->getOperand(1), Node->getOperand(2),
19470 cast<AtomicSDNode>(Node)->getMemOperand(),
19471 cast<AtomicSDNode>(Node)->getOrdering(),
19472 cast<AtomicSDNode>(Node)->getSynchScope());
19473 return Swap.getValue(1);
19474 }
19475 // Other atomic stores have a simple pattern.
19476 return Op;
19477}
19478
19479static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19480 EVT VT = Op.getNode()->getSimpleValueType(0);
19481
19482 // Let legalize expand this if it isn't a legal type yet.
19483 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19484 return SDValue();
19485
19486 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19487
19488 unsigned Opc;
19489 bool ExtraOp = false;
19490 switch (Op.getOpcode()) {
19491 default: llvm_unreachable("Invalid code")::llvm::llvm_unreachable_internal("Invalid code", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19491)
;
19492 case ISD::ADDC: Opc = X86ISD::ADD; break;
19493 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19494 case ISD::SUBC: Opc = X86ISD::SUB; break;
19495 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19496 }
19497
19498 if (!ExtraOp)
19499 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19500 Op.getOperand(1));
19501 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19502 Op.getOperand(1), Op.getOperand(2));
19503}
19504
19505static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19506 SelectionDAG &DAG) {
19507 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19508
19509 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19510 // which returns the values as { float, float } (in XMM0) or
19511 // { double, double } (which is returned in XMM0, XMM1).
19512 SDLoc dl(Op);
19513 SDValue Arg = Op.getOperand(0);
19514 EVT ArgVT = Arg.getValueType();
19515 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19516
19517 TargetLowering::ArgListTy Args;
19518 TargetLowering::ArgListEntry Entry;
19519
19520 Entry.Node = Arg;
19521 Entry.Ty = ArgTy;
19522 Entry.isSExt = false;
19523 Entry.isZExt = false;
19524 Args.push_back(Entry);
19525
19526 bool isF64 = ArgVT == MVT::f64;
19527 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19528 // the small struct {f32, f32} is returned in (eax, edx). For f64,
19529 // the results are returned via SRet in memory.
19530 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
19531 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19532 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19533
19534 Type *RetTy = isF64
19535 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19536 : (Type*)VectorType::get(ArgTy, 4);
19537
19538 TargetLowering::CallLoweringInfo CLI(DAG);
19539 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19540 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19541
19542 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19543
19544 if (isF64)
19545 // Returned in xmm0 and xmm1.
19546 return CallResult.first;
19547
19548 // Returned in bits 0:31 and 32:64 xmm0.
19549 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19550 CallResult.first, DAG.getIntPtrConstant(0));
19551 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19552 CallResult.first, DAG.getIntPtrConstant(1));
19553 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19554 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19555}
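// A hedged illustration (not from this file) of the pattern that can reach
// LowerFSINCOS on x86-64 Darwin: sin and cos of the same argument. Whether
// the optimizer actually merges the two calls into ISD::FSINCOS depends on
// the pipeline (e.g. errno handling being disabled -- an assumption here);
// when it does, the node is lowered to one __sincos_stret call returning both
// values in XMM0/XMM1.
#include <cmath>

static void sin_and_cos(double x, double &s, double &c) {
  s = std::sin(x);
  c = std::cos(x);
}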
19556
19557/// LowerOperation - Provide custom lowering hooks for some operations.
19558///
19559SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19560 switch (Op.getOpcode()) {
19561 default: llvm_unreachable("Should not custom lower this!")::llvm::llvm_unreachable_internal("Should not custom lower this!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19561)
;
19562 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
19563 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19564 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19565 return LowerCMP_SWAP(Op, Subtarget, DAG);
19566 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
19567 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
19568 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
19569 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
19570 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
19571 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
19572 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
19573 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19574 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
19575 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19576 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19577 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
19578 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
19579 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
19580 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
19581 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
19582 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
19583 case ISD::SHL_PARTS:
19584 case ISD::SRA_PARTS:
19585 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
19586 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
19587 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
19588 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
19589 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
19590 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19591 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
19592 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
19593 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
19594 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
19595 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
19596 case ISD::FABS:
19597 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
19598 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
19599 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
19600 case ISD::SETCC: return LowerSETCC(Op, DAG);
19601 case ISD::SELECT: return LowerSELECT(Op, DAG);
19602 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
19603 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
19604 case ISD::VASTART: return LowerVASTART(Op, DAG);
19605 case ISD::VAARG: return LowerVAARG(Op, DAG);
19606 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
19607 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19608 case ISD::INTRINSIC_VOID:
19609 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19610 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
19611 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
19612 case ISD::FRAME_TO_ARGS_OFFSET:
19613 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19614 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19615 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
19616 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
19617 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
19618 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
19619 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
19620 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
19621 case ISD::CTLZ: return LowerCTLZ(Op, DAG);
19622 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19623 case ISD::CTTZ: return LowerCTTZ(Op, DAG);
19624 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
19625 case ISD::UMUL_LOHI:
19626 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
19627 case ISD::SRA:
19628 case ISD::SRL:
19629 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
19630 case ISD::SADDO:
19631 case ISD::UADDO:
19632 case ISD::SSUBO:
19633 case ISD::USUBO:
19634 case ISD::SMULO:
19635 case ISD::UMULO: return LowerXALUO(Op, DAG);
19636 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19637 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
19638 case ISD::ADDC:
19639 case ISD::ADDE:
19640 case ISD::SUBC:
19641 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19642 case ISD::ADD: return LowerADD(Op, DAG);
19643 case ISD::SUB: return LowerSUB(Op, DAG);
19644 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
19645 }
19646}
19647
19648/// ReplaceNodeResults - Replace a node with an illegal result type
19649/// with a new node built out of custom code.
19650void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19651 SmallVectorImpl<SDValue>&Results,
19652 SelectionDAG &DAG) const {
19653 SDLoc dl(N);
19654 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19655 switch (N->getOpcode()) {
19656 default:
19657 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19657)
;
19658 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19659 case X86ISD::FMINC:
19660 case X86ISD::FMIN:
19661 case X86ISD::FMAXC:
19662 case X86ISD::FMAX: {
19663 EVT VT = N->getValueType(0);
19664 if (VT != MVT::v2f32)
19665 llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.")::llvm::llvm_unreachable_internal("Unexpected type (!= v2f32) on FMIN/FMAX."
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn227765/lib/Target/X86/X86ISelLowering.cpp"
, 19665)
;
19666 SDValue UNDEF = DAG.getUNDEF(VT);
19667 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19668 N->getOperand(0), UNDEF);
19669 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19670 N->getOperand(1), UNDEF);
19671 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19672 return;
19673 }
19674 case ISD::SIGN_EXTEND_INREG:
19675 case ISD::ADDC:
19676 case ISD::ADDE:
19677 case ISD::SUBC:
19678 case ISD::SUBE:
19679 // We don't want to expand or promote these.
19680 return;
19681 case ISD::SDIV:
19682 case ISD::UDIV:
19683 case ISD::SREM:
19684 case ISD::UREM:
19685 case ISD::SDIVREM:
19686 case ISD::UDIVREM: {
19687 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19688 Results.push_back(V);
19689 return;
19690 }
19691 case ISD::FP_TO_SINT:
19692 case ISD::FP_TO_UINT: {
19693 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19694
19695 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19696 return;
19697
19698 std::pair<SDValue,SDValue> Vals =
19699 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19700 SDValue FIST = Vals.first, StackSlot = Vals.second;
19701 if (FIST.getNode()) {
19702 EVT VT = N->getValueType(0);
19703 // Return a load from the stack slot.
19704 if (StackSlot.getNode())
19705 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19706 MachinePointerInfo(),
19707 false, false, false, 0));
19708 else
19709 Results.push_back(FIST);
19710 }
19711 return;
19712 }
19713 case ISD::UINT_TO_FP: {
19714 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19715 if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19716 N->getValueType(0) != MVT::v2f32)
19717 return;
19718 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19719 N->getOperand(0));
19720 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19721 MVT::f64);
19722 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19723 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19724 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19725 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19726 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19727 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19728 return;
19729 }
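// A minimal standalone sketch (not from this file) modelling one lane of the
// v2i32 UINT_TO_FP expansion above, assuming IEEE-754 doubles: OR the 32-bit
// value into the mantissa of 2^52 (bit pattern 0x4330000000000000), which is
// exactly 2^52 + x as a double, then subtract the 2^52 bias; the final float
// cast corresponds to the X86ISD::VFPROUND.
#include <cstdint>
#include <cstring>

static float uint32_to_float_model(uint32_t x) {
  uint64_t bits = 0x4330000000000000ULL | x;  // ISD::OR with the VBias bits
  double d;
  std::memcpy(&d, &bits, sizeof(d));          // ISD::BITCAST to a v2f64 lane
  return static_cast<float>(d - 4503599627370496.0); // FSUB of 2^52, then round
}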
19730 case ISD::FP_ROUND: {
19731 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19732 return;
19733 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19734 Results.push_back(V);
19735 return;
19736 }
19737 case ISD::INTRINSIC_W_CHAIN: {
19738 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19739 switch (IntNo) {
19740 default : llvm_unreachable("Do not know how to custom type "
19741 "legalize this intrinsic operation!");
19742 case Intrinsic::x86_rdtsc:
19743 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19744 Results);
19745 case Intrinsic::x86_rdtscp:
19746 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19747 Results);
19748 case Intrinsic::x86_rdpmc:
19749 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19750 }
19751 }
19752 case ISD::READCYCLECOUNTER: {
19753 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19754 Results);
19755 }
19756 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19757 EVT T = N->getValueType(0);
19758 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19759 bool Regs64bit = T == MVT::i128;
19760 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19761 SDValue cpInL, cpInH;
19762 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19763 DAG.getConstant(0, HalfT));
19764 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19765 DAG.getConstant(1, HalfT));
19766 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19767 Regs64bit ? X86::RAX : X86::EAX,
19768 cpInL, SDValue());
19769 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19770 Regs64bit ? X86::RDX : X86::EDX,
19771 cpInH, cpInL.getValue(1));
19772 SDValue swapInL, swapInH;
19773 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19774 DAG.getConstant(0, HalfT));
19775 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19776 DAG.getConstant(1, HalfT));
19777 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19778 Regs64bit ? X86::RBX : X86::EBX,
19779 swapInL, cpInH.getValue(1));
19780 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19781 Regs64bit ? X86::RCX : X86::ECX,
19782 swapInH, swapInL.getValue(1));
19783 SDValue Ops[] = { swapInH.getValue(0),
19784 N->getOperand(1),
19785 swapInH.getValue(1) };
19786 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19787 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19788 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19789 X86ISD::LCMPXCHG8_DAG;
19790 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19791 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19792 Regs64bit ? X86::RAX : X86::EAX,
19793 HalfT, Result.getValue(1));
19794 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19795 Regs64bit ? X86::RDX : X86::EDX,
19796 HalfT, cpOutL.getValue(2));
19797 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19798
19799 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19800 MVT::i32, cpOutH.getValue(2));
19801 SDValue Success =
19802 DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19803 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19804 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19805
19806 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19807 Results.push_back(Success);
19808 Results.push_back(EFLAGS.getValue(1));
19809 return;
19810 }
19811 case ISD::ATOMIC_SWAP:
19812 case ISD::ATOMIC_LOAD_ADD:
19813 case ISD::ATOMIC_LOAD_SUB:
19814 case ISD::ATOMIC_LOAD_AND:
19815 case ISD::ATOMIC_LOAD_OR:
19816 case ISD::ATOMIC_LOAD_XOR:
19817 case ISD::ATOMIC_LOAD_NAND:
19818 case ISD::ATOMIC_LOAD_MIN:
19819 case ISD::ATOMIC_LOAD_MAX:
19820 case ISD::ATOMIC_LOAD_UMIN:
19821 case ISD::ATOMIC_LOAD_UMAX:
19822 case ISD::ATOMIC_LOAD: {
19823 // Delegate to generic TypeLegalization. Situations we can really handle
19824 // should have already been dealt with by AtomicExpandPass.cpp.
19825 break;
19826 }
19827 case ISD::BITCAST: {
19828 assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19829 EVT DstVT = N->getValueType(0);
19830 EVT SrcVT = N->getOperand(0)->getValueType(0);
19831
19832 if (SrcVT != MVT::f64 ||
19833 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19834 return;
19835
19836 unsigned NumElts = DstVT.getVectorNumElements();
19837 EVT SVT = DstVT.getVectorElementType();
19838 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19839 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19840 MVT::v2f64, N->getOperand(0));
19841 SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19842
19843 if (ExperimentalVectorWideningLegalization) {
19844 // If we are legalizing vectors by widening, we already have the desired
19845 // legal vector type, just return it.
19846 Results.push_back(ToVecInt);
19847 return;
19848 }
19849
19850 SmallVector<SDValue, 8> Elts;
19851 for (unsigned i = 0, e = NumElts; i != e; ++i)
19852 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19853 ToVecInt, DAG.getIntPtrConstant(i)));
19854
19855 Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19856 }
19857 }
19858}
19859
19860const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19861 switch (Opcode) {
19862 default: return nullptr;
19863 case X86ISD::BSF: return "X86ISD::BSF";
19864 case X86ISD::BSR: return "X86ISD::BSR";
19865 case X86ISD::SHLD: return "X86ISD::SHLD";
19866 case X86ISD::SHRD: return "X86ISD::SHRD";
19867 case X86ISD::FAND: return "X86ISD::FAND";
19868 case X86ISD::FANDN: return "X86ISD::FANDN";
19869 case X86ISD::FOR: return "X86ISD::FOR";
19870 case X86ISD::FXOR: return "X86ISD::FXOR";
19871 case X86ISD::FSRL: return "X86ISD::FSRL";
19872 case X86ISD::FILD: return "X86ISD::FILD";
19873 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
19874 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19875 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19876 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19877 case X86ISD::FLD: return "X86ISD::FLD";
19878 case X86ISD::FST: return "X86ISD::FST";
19879 case X86ISD::CALL: return "X86ISD::CALL";
19880 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
19881 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
19882 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
19883 case X86ISD::BT: return "X86ISD::BT";
19884 case X86ISD::CMP: return "X86ISD::CMP";
19885 case X86ISD::COMI: return "X86ISD::COMI";
19886 case X86ISD::UCOMI: return "X86ISD::UCOMI";
19887 case X86ISD::CMPM: return "X86ISD::CMPM";
19888 case X86ISD::CMPMU: return "X86ISD::CMPMU";
19889 case X86ISD::SETCC: return "X86ISD::SETCC";
19890 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
19891 case X86ISD::FSETCC: return "X86ISD::FSETCC";
19892 case X86ISD::CMOV: return "X86ISD::CMOV";
19893 case X86ISD::BRCOND: return "X86ISD::BRCOND";
19894 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
19895 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
19896 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
19897 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
19898 case X86ISD::Wrapper: return "X86ISD::Wrapper";
19899 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
19900 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
19901 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
19902 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
19903 case X86ISD::PINSRB: return "X86ISD::PINSRB";
19904 case X86ISD::PINSRW: return "X86ISD::PINSRW";
19905 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
19906 case X86ISD::ANDNP: return "X86ISD::ANDNP";
19907 case X86ISD::PSIGN: return "X86ISD::PSIGN";
19908 case X86ISD::BLENDI: return "X86ISD::BLENDI";
19909 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
19910 case X86ISD::SUBUS: return "X86ISD::SUBUS";
19911 case X86ISD::HADD: return "X86ISD::HADD";
19912 case X86ISD::HSUB: return "X86ISD::HSUB";
19913 case X86ISD::FHADD: return "X86ISD::FHADD";
19914 case X86ISD::FHSUB: return "X86ISD::FHSUB";
19915 case X86ISD::UMAX: return "X86ISD::UMAX";
19916 case X86ISD::UMIN: return "X86ISD::UMIN";
19917 case X86ISD::SMAX: return "X86ISD::SMAX";
19918 case X86ISD::SMIN: return "X86ISD::SMIN";
19919 case X86ISD::FMAX: return "X86ISD::FMAX";
19920 case X86ISD::FMIN: return "X86ISD::FMIN";
19921 case X86ISD::FMAXC: return "X86ISD::FMAXC";
19922 case X86ISD::FMINC: return "X86ISD::FMINC";
19923 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
19924 case X86ISD::FRCP: return "X86ISD::FRCP";
19925 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
19926 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
19927 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
19928 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
19929 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
19930 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
19931 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
19932 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
19933 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
19934 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
19935 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
19936 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
19937 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
19938 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
19939 case X86ISD::VZEXT: return "X86ISD::VZEXT";
19940 case X86ISD::VSEXT: return "X86ISD::VSEXT";
19941 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
19942 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
19943 case X86ISD::VINSERT: return "X86ISD::VINSERT";
19944 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
19945 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
19946 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
19947 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
19948 case X86ISD::VSHL: return "X86ISD::VSHL";
19949 case X86ISD::VSRL: return "X86ISD::VSRL";
19950 case X86ISD::VSRA: return "X86ISD::VSRA";
19951 case X86ISD::VSHLI: return "X86ISD::VSHLI";
19952 case X86ISD::VSRLI: return "X86ISD::VSRLI";
19953 case X86ISD::VSRAI: return "X86ISD::VSRAI";
19954 case X86ISD::CMPP: return "X86ISD::CMPP";
19955 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
19956 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
19957 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
19958 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
19959 case X86ISD::ADD: return "X86ISD::ADD";
19960 case X86ISD::SUB: return "X86ISD::SUB";
19961 case X86ISD::ADC: return "X86ISD::ADC";
19962 case X86ISD::SBB: return "X86ISD::SBB";
19963 case X86ISD::SMUL: return "X86ISD::SMUL";
19964 case X86ISD::UMUL: return "X86ISD::UMUL";
19965 case X86ISD::SMUL8: return "X86ISD::SMUL8";
19966 case X86ISD::UMUL8: return "X86ISD::UMUL8";
19967 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
19968 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
19969 case X86ISD::INC: return "X86ISD::INC";
19970 case X86ISD::DEC: return "X86ISD::DEC";
19971 case X86ISD::OR: return "X86ISD::OR";
19972 case X86ISD::XOR: return "X86ISD::XOR";
19973 case X86ISD::AND: return "X86ISD::AND";
19974 case X86ISD::BEXTR: return "X86ISD::BEXTR";
19975 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
19976 case X86ISD::PTEST: return "X86ISD::PTEST";
19977 case X86ISD::TESTP: return "X86ISD::TESTP";
19978 case X86ISD::TESTM: return "X86ISD::TESTM";
19979 case X86ISD::TESTNM: return "X86ISD::TESTNM";
19980 case X86ISD::KORTEST: return "X86ISD::KORTEST";
19981 case X86ISD::PACKSS: return "X86ISD::PACKSS";
19982 case X86ISD::PACKUS: return "X86ISD::PACKUS";
19983 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
19984 case X86ISD::VALIGN: return "X86ISD::VALIGN";
19985 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
19986 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
19987 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
19988 case X86ISD::SHUFP: return "X86ISD::SHUFP";
19989 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
19990 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
19991 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
19992 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
19993 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
19994 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
19995 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
19996 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
19997 case X86ISD::MOVSD: return "X86ISD::MOVSD";
19998 case X86ISD::MOVSS: return "X86ISD::MOVSS";
19999 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
20000 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
20001 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
20002 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
20003 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
20004 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
20005 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
20006 case X86ISD::VPERMV: return "X86ISD::VPERMV";
20007 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
20008 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
20009 case X86ISD::VPERMI: return "X86ISD::VPERMI";
20010 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
20011 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
20012 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20013 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
20014 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
20015 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
20016 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
20017 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
20018 case X86ISD::SAHF: return "X86ISD::SAHF";
20019 case X86ISD::RDRAND: return "X86ISD::RDRAND";
20020 case X86ISD::RDSEED: return "X86ISD::RDSEED";
20021 case X86ISD::FMADD: return "X86ISD::FMADD";
20022 case X86ISD::FMSUB: return "X86ISD::FMSUB";
20023 case X86ISD::FNMADD: return "X86ISD::FNMADD";
20024 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
20025 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
20026 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
20027 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
20028 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
20029 case X86ISD::XTEST: return "X86ISD::XTEST";
20030 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
20031 case X86ISD::EXPAND: return "X86ISD::EXPAND";
20032 case X86ISD::SELECT: return "X86ISD::SELECT";
20033 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
20034 case X86ISD::RCP28: return "X86ISD::RCP28";
20035 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
20036 }
20037}
20038
20039// isLegalAddressingMode - Return true if the addressing mode represented
20040// by AM is legal for this target, for a load/store of the specified type.
20041bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20042 Type *Ty) const {
20043 // X86 supports extremely general addressing modes.
20044 CodeModel::Model M = getTargetMachine().getCodeModel();
20045 Reloc::Model R = getTargetMachine().getRelocationModel();
20046
20047 // X86 allows a sign-extended 32-bit immediate field as a displacement.
20048 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20049 return false;
20050
20051 if (AM.BaseGV) {
20052 unsigned GVFlags =
20053 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20054
20055 // If a reference to this global requires an extra load, we can't fold it.
20056 if (isGlobalStubReference(GVFlags))
20057 return false;
20058
20059 // If BaseGV requires a register for the PIC base, we cannot also have a
20060 // BaseReg specified.
20061 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20062 return false;
20063
20064 // If lower 4G is not available, then we must use rip-relative addressing.
20065 if ((M != CodeModel::Small || R != Reloc::Static) &&
20066 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20067 return false;
20068 }
20069
20070 switch (AM.Scale) {
20071 case 0:
20072 case 1:
20073 case 2:
20074 case 4:
20075 case 8:
20076 // These scales always work.
20077 break;
20078 case 3:
20079 case 5:
20080 case 9:
20081 // These scales are formed with basereg+scalereg. Only accept if there is
20082 // no basereg yet.
20083 if (AM.HasBaseReg)
20084 return false;
20085 break;
20086 default: // Other stuff never works.
20087 return false;
20088 }
20089
20090 return true;
20091}
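// Illustrative sketch of what the rules above accept (register names are just
// examples, not taken from this file):
//   movl 16(%rdi,%rcx,4), %eax   // base + 4*index + disp: Scale == 4, legal
//   leaq (%rdi,%rdi,2), %rax     // 3*x folded as base + 2*base: Scale == 3,
//                                // legal only because no other base register
//                                // is in use
// A mode such as base + 7*index has no x86 encoding and falls into the
// rejecting default case above.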
20092
20093bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20094 unsigned Bits = Ty->getScalarSizeInBits();
20095
20096 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20097 // particularly cheaper than those without.
20098 if (Bits == 8)
20099 return false;
20100
20101 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20102 // variable shifts just as cheap as scalar ones.
20103 if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20104 return false;
20105
20106 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20107 // fully general vector.
20108 return true;
20109}
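// Rough example of the trade-off above (AVX2, illustrative operands only):
//   vpsllvd %ymm1, %ymm0, %ymm2   // per-element shift amounts, one instruction
//   vpslld  %xmm1, %ymm0, %ymm2   // all elements shifted by one scalar amount
// With AVX2 both forms cost about the same, so returning false keeps the fully
// general vector shift; without AVX2 the scalar-amount form is clearly cheaper.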
20110
20111bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20112 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20113 return false;
20114 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20115 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20116 return NumBits1 > NumBits2;
20117}
20118
20119bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20120 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20121 return false;
20122
20123 if (!isTypeLegal(EVT::getEVT(Ty1)))
20124 return false;
20125
20126 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20127
20128 // Assuming the caller doesn't have a zeroext or signext return parameter,
20129 // truncation all the way down to i1 is valid.
20130 return true;
20131}
20132
20133bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20134 return isInt<32>(Imm);
20135}
20136
20137bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20138 // Can also use sub to handle negated immediates.
20139 return isInt<32>(Imm);
20140}
20141
20142bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20143 if (!VT1.isInteger() || !VT2.isInteger())
20144 return false;
20145 unsigned NumBits1 = VT1.getSizeInBits();
20146 unsigned NumBits2 = VT2.getSizeInBits();
20147 return NumBits1 > NumBits2;
20148}
20149
20150bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20151 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20152 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20153}
20154
20155bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20156 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20157 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20158}
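// Example of the implicit zero-extension being relied on here:
//   movl %esi, %eax   // writing a 32-bit register clears bits 63:32 of %rax
// so an explicit i32 -> i64 zext costs nothing extra.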
20159
20160bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20161 EVT VT1 = Val.getValueType();
20162 if (isZExtFree(VT1, VT2))
20163 return true;
20164
20165 if (Val.getOpcode() != ISD::LOAD)
20166 return false;
20167
20168 if (!VT1.isSimple() || !VT1.isInteger() ||
20169 !VT2.isSimple() || !VT2.isInteger())
20170 return false;
20171
20172 switch (VT1.getSimpleVT().SimpleTy) {
20173 default: break;
20174 case MVT::i8:
20175 case MVT::i16:
20176 case MVT::i32:
20177 // X86 has 8, 16, and 32-bit zero-extending loads.
20178 return true;
20179 }
20180
20181 return false;
20182}
20183
20184bool
20185X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20186 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20187 return false;
20188
20189 VT = VT.getScalarType();
20190
20191 if (!VT.isSimple())
20192 return false;
20193
20194 switch (VT.getSimpleVT().SimpleTy) {
20195 case MVT::f32:
20196 case MVT::f64:
20197 return true;
20198 default:
20199 break;
20200 }
20201
20202 return false;
20203}
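// e.g. when FMA3/FMA4 is available, (fadd (fmul a, b), c) on f32/f64 (or the
// corresponding vector types) can become a single vfmadd* instruction instead
// of a separate multiply and add, which is why we answer true for those types.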
20204
20205bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20206 // i16 instructions are longer (0x66 prefix) and potentially slower.
20207 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20208}
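// For instance (illustrative encoding detail): "addw $1, %ax" carries a 0x66
// operand-size prefix and may cause partial-register stalls on some
// microarchitectures, while "addl $1, %eax" does not, so narrowing i32 to i16
// is reported as unprofitable.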
20209
20210/// isShuffleMaskLegal - Targets can use this to indicate that they only
20211/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20212/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20213/// are assumed to be legal.
20214bool
20215X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20216 EVT VT) const {
20217 if (!VT.isSimple())
20218 return false;
20219
20220 MVT SVT = VT.getSimpleVT();
20221
20222 // Very little shuffling can be done for 64-bit vectors right now.
20223 if (VT.getSizeInBits() == 64)
20224 return false;
20225
20226 // This is an experimental legality test that is tailored to match the
20227 // legality test of the experimental lowering more closely. They are gated
20228 // separately to ease testing of performance differences.
20229 if (ExperimentalVectorShuffleLegality)
20230 // We only care that the types being shuffled are legal. The lowering can
20231 // handle any possible shuffle mask that results.
20232 return isTypeLegal(SVT);
20233
20234 // If this is a single-input shuffle with no 128 bit lane crossings we can
20235 // lower it into pshufb.
20236 if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20237 (SVT.is256BitVector() && Subtarget->hasInt256())) {
20238 bool isLegal = true;
20239 for (unsigned I = 0, E = M.size(); I != E; ++I) {
20240 if (M[I] >= (int)SVT.getVectorNumElements() ||
20241 ShuffleCrosses128bitLane(SVT, I, M[I])) {
20242 isLegal = false;
20243 break;
20244 }
20245 }
20246 if (isLegal)
20247 return true;
20248 }
20249
20250 // FIXME: blends, shifts.
20251 return (SVT.getVectorNumElements() == 2 ||
20252 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20253 isMOVLMask(M, SVT) ||
20254 isCommutedMOVLMask(M, SVT) ||
20255 isMOVHLPSMask(M, SVT) ||
20256 isSHUFPMask(M, SVT) ||
20257 isSHUFPMask(M, SVT, /* Commuted */ true) ||
20258 isPSHUFDMask(M, SVT) ||
20259 isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20260 isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20261 isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20262 isPALIGNRMask(M, SVT, Subtarget) ||
20263 isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20264 isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20265 isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20266 isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20267 isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20268 (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20269}
20270
20271bool
20272X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20273 EVT VT) const {
20274 if (!VT.isSimple())
20275 return false;
20276
20277 MVT SVT = VT.getSimpleVT();
20278
20279 // This is an experimental legality test that is tailored to match the
20280 // legality test of the experimental lowering more closely. They are gated
20281 // separately to ease testing of performance differences.
20282 if (ExperimentalVectorShuffleLegality)
20283 // The new vector shuffle lowering is very good at managing zero-inputs.
20284 return isShuffleMaskLegal(Mask, VT);
20285
20286 unsigned NumElts = SVT.getVectorNumElements();
20287 // FIXME: This collection of masks seems suspect.
20288 if (NumElts == 2)
20289 return true;
20290 if (NumElts == 4 && SVT.is128BitVector()) {
20291 return (isMOVLMask(Mask, SVT) ||
20292 isCommutedMOVLMask(Mask, SVT, true) ||
20293 isSHUFPMask(Mask, SVT) ||
20294 isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20295 isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20296 Subtarget->hasInt256()));
20297 }
20298 return false;
20299}
20300
20301//===----------------------------------------------------------------------===//
20302// X86 Scheduler Hooks
20303//===----------------------------------------------------------------------===//
20304
20305/// Utility function to emit xbegin specifying the start of an RTM region.
20306static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20307 const TargetInstrInfo *TII) {
20308 DebugLoc DL = MI->getDebugLoc();
20309
20310 const BasicBlock *BB = MBB->getBasicBlock();
20311 MachineFunction::iterator I = MBB;
20312 ++I;
20313
20314 // For the v = xbegin(), we generate
20315 //
20316 // thisMBB:
20317 // xbegin sinkMBB
20318 //
20319 // mainMBB:
20320 // eax = -1
20321 //
20322 // sinkMBB:
20323 // v = eax
20324
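// (For reference: on a successful transaction start XBEGIN falls through, so
// mainMBB materializes -1, the conventional _XBEGIN_STARTED value; on an abort
// the hardware resumes at sinkMBB with the abort status already in EAX.)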
20325 MachineBasicBlock *thisMBB = MBB;
20326 MachineFunction *MF = MBB->getParent();
20327 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20328 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20329 MF->insert(I, mainMBB);
20330 MF->insert(I, sinkMBB);
20331
20332 // Transfer the remainder of BB and its successor edges to sinkMBB.
20333 sinkMBB->splice(sinkMBB->begin(), MBB,
20334 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20335 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20336
20337 // thisMBB:
20338 // xbegin sinkMBB
20339 // # fallthrough to mainMBB
20340 // # abort path to sinkMBB
20341 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20342 thisMBB->addSuccessor(mainMBB);
20343 thisMBB->addSuccessor(sinkMBB);
20344
20345 // mainMBB:
20346 // EAX = -1
20347 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20348 mainMBB->addSuccessor(sinkMBB);
20349
20350 // sinkMBB:
20351 // EAX is live into the sinkMBB
20352 sinkMBB->addLiveIn(X86::EAX);
20353 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20354 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20355 .addReg(X86::EAX);
20356
20357 MI->eraseFromParent();
20358 return sinkMBB;
20359}
20360
20361// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20362// or XMM0_V32I8 in AVX all of this code can be replaced with that
20363// in the .td file.
20364static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20365 const TargetInstrInfo *TII) {
20366 unsigned Opc;
20367 switch (MI->getOpcode()) {
20368 default: llvm_unreachable("illegal opcode!");
20369 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
20370 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20371 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
20372 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20373 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
20374 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20375 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
20376 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20377 }
20378
20379 DebugLoc dl = MI->getDebugLoc();
20380 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20381
20382 unsigned NumArgs = MI->getNumOperands();
20383 for (unsigned i = 1; i < NumArgs; ++i) {
20384 MachineOperand &Op = MI->getOperand(i);
20385 if (!(Op.isReg() && Op.isImplicit()))
20386 MIB.addOperand(Op);
20387 }
20388 if (MI->hasOneMemOperand())
20389 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20390
20391 BuildMI(*BB, MI, dl,
20392 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20393 .addReg(X86::XMM0);
20394
20395 MI->eraseFromParent();
20396 return BB;
20397}
20398
20399// FIXME: Custom handling because TableGen doesn't support multiple implicit
20400// defs in an instruction pattern
20401static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20402 const TargetInstrInfo *TII) {
20403 unsigned Opc;
20404 switch (MI->getOpcode()) {
20405 default: llvm_unreachable("illegal opcode!");
20406 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
20407 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20408 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
20409 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20410 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
20411 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20412 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
20413 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20414 }
20415
20416 DebugLoc dl = MI->getDebugLoc();
20417 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20418
20419 unsigned NumArgs = MI->getNumOperands(); // remove the results
20420 for (unsigned i = 1; i < NumArgs; ++i) {
20421 MachineOperand &Op = MI->getOperand(i);
20422 if (!(Op.isReg() && Op.isImplicit()))
20423 MIB.addOperand(Op);
20424 }
20425 if (MI->hasOneMemOperand())
20426 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20427
20428 BuildMI(*BB, MI, dl,
20429 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20430 .addReg(X86::ECX);
20431
20432 MI->eraseFromParent();
20433 return BB;
20434}
20435
20436static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20437 const TargetInstrInfo *TII,
20438 const X86Subtarget* Subtarget) {
20439 DebugLoc dl = MI->getDebugLoc();
20440
20441 // Address into RAX/EAX, other two args into ECX, EDX.
20442 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20443 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20444 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20445 for (int i = 0; i < X86::AddrNumOperands; ++i)
20446 MIB.addOperand(MI->getOperand(i));
20447
20448 unsigned ValOps = X86::AddrNumOperands;
20449 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20450 .addReg(MI->getOperand(ValOps).getReg());
20451 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20452 .addReg(MI->getOperand(ValOps+1).getReg());
20453
20454 // The instruction doesn't actually take any operands though.
20455 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20456
20457 MI->eraseFromParent(); // The pseudo is gone now.
20458 return BB;
20459}
20460
20461MachineBasicBlock *
20462X86TargetLowering::EmitVAARG64WithCustomInserter(
20463 MachineInstr *MI,
20464 MachineBasicBlock *MBB) const {
20465 // Emit va_arg instruction on X86-64.
20466
20467 // Operands to this pseudo-instruction:
20468 // 0 ) Output : destination address (reg)
20469 // 1-5) Input : va_list address (addr, i64mem)
20470 // 6 ) ArgSize : Size (in bytes) of vararg type
20471 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20472 // 8 ) Align : Alignment of type
20473 // 9 ) EFLAGS (implicit-def)
20474
20475 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20476 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20477
20478 unsigned DestReg = MI->getOperand(0).getReg();
20479 MachineOperand &Base = MI->getOperand(1);
20480 MachineOperand &Scale = MI->getOperand(2);
20481 MachineOperand &Index = MI->getOperand(3);
20482 MachineOperand &Disp = MI->getOperand(4);
20483 MachineOperand &Segment = MI->getOperand(5);
20484 unsigned ArgSize = MI->getOperand(6).getImm();
20485 unsigned ArgMode = MI->getOperand(7).getImm();
20486 unsigned Align = MI->getOperand(8).getImm();
20487
20488 // Memory Reference
20489 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20490 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20491 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20492
20493 // Machine Information
20494 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20495 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20496 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20497 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20498 DebugLoc DL = MI->getDebugLoc();
20499
20500 // struct va_list {
20501 // i32 gp_offset
20502 // i32 fp_offset
20503 // i64 overflow_area (address)
20504 // i64 reg_save_area (address)
20505 // }
20506 // sizeof(va_list) = 24
20507 // alignment(va_list) = 8
20508
20509 unsigned TotalNumIntRegs = 6;
20510 unsigned TotalNumXMMRegs = 8;
20511 bool UseGPOffset = (ArgMode == 1);
20512 bool UseFPOffset = (ArgMode == 2);
20513 unsigned MaxOffset = TotalNumIntRegs * 8 +
20514 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
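// Concretely, with the SysV AMD64 register save area this gives MaxOffset = 48
// (6 GP registers * 8 bytes) when only gp_offset is used, and 48 + 8 * 16 = 176
// when fp_offset is in use, since the 8 XMM registers occupy 16-byte slots.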
20515
20516 /* Align ArgSize to a multiple of 8 */
20517 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20518 bool NeedsAlign = (Align > 8);
20519
20520 MachineBasicBlock *thisMBB = MBB;
20521 MachineBasicBlock *overflowMBB;
20522 MachineBasicBlock *offsetMBB;
20523 MachineBasicBlock *endMBB;
20524
20525 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
20526 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
20527 unsigned OffsetReg = 0;
20528
20529 if (!UseGPOffset && !UseFPOffset) {
20530 // If we only pull from the overflow region, we don't create a branch.
20531 // We don't need to alter control flow.
20532 OffsetDestReg = 0; // unused
20533 OverflowDestReg = DestReg;
20534
20535 offsetMBB = nullptr;
20536 overflowMBB = thisMBB;
20537 endMBB = thisMBB;
20538 } else {
20539 // First emit code to check if gp_offset (or fp_offset) is below the bound.
20540 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20541 // If not, pull from overflow_area. (branch to overflowMBB)
20542 //
20543 // thisMBB
20544 // | .
20545 // | .
20546 // offsetMBB overflowMBB
20547 // | .
20548 // | .
20549 // endMBB
20550
20551 // Registers for the PHI in endMBB
20552 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20553 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20554
20555 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20556 MachineFunction *MF = MBB->getParent();
20557 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20558 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20559 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20560
20561 MachineFunction::iterator MBBIter = MBB;
20562 ++MBBIter;
20563
20564 // Insert the new basic blocks
20565 MF->insert(MBBIter, offsetMBB);
20566 MF->insert(MBBIter, overflowMBB);
20567 MF->insert(MBBIter, endMBB);
20568
20569 // Transfer the remainder of MBB and its successor edges to endMBB.
20570 endMBB->splice(endMBB->begin(), thisMBB,
20571 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20572 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20573
20574 // Make offsetMBB and overflowMBB successors of thisMBB
20575 thisMBB->addSuccessor(offsetMBB);
20576 thisMBB->addSuccessor(overflowMBB);
20577
20578 // endMBB is a successor of both offsetMBB and overflowMBB
20579 offsetMBB->addSuccessor(endMBB);
20580 overflowMBB->addSuccessor(endMBB);
20581
20582 // Load the offset value into a register
20583 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20584 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20585 .addOperand(Base)
20586 .addOperand(Scale)
20587 .addOperand(Index)
20588 .addDisp(Disp, UseFPOffset ? 4 : 0)
20589 .addOperand(Segment)
20590 .setMemRefs(MMOBegin, MMOEnd);
20591
20592 // Check if there is enough room left to pull this argument.
20593 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20594 .addReg(OffsetReg)
20595 .addImm(MaxOffset + 8 - ArgSizeA8);
20596
20597 // Branch to "overflowMBB" if offset >= max
20598 // Fall through to "offsetMBB" otherwise
20599 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20600 .addMBB(overflowMBB);
20601 }
20602
20603 // In offsetMBB, emit code to use the reg_save_area.
20604 if (offsetMBB) {
20605 assert(OffsetReg != 0);
20606
20607 // Read the reg_save_area address.
20608 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20609 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20610 .addOperand(Base)
20611 .addOperand(Scale)
20612 .addOperand(Index)
20613 .addDisp(Disp, 16)
20614 .addOperand(Segment)
20615 .setMemRefs(MMOBegin, MMOEnd);
20616
20617 // Zero-extend the offset
20618 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20619 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20620 .addImm(0)
20621 .addReg(OffsetReg)
20622 .addImm(X86::sub_32bit);
20623
20624 // Add the offset to the reg_save_area to get the final address.
20625 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20626 .addReg(OffsetReg64)
20627 .addReg(RegSaveReg);
20628
20629 // Compute the offset for the next argument
20630 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20631 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20632 .addReg(OffsetReg)
20633 .addImm(UseFPOffset ? 16 : 8);
20634
20635 // Store it back into the va_list.
20636 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20637 .addOperand(Base)
20638 .addOperand(Scale)
20639 .addOperand(Index)
20640 .addDisp(Disp, UseFPOffset ? 4 : 0)
20641 .addOperand(Segment)
20642 .addReg(NextOffsetReg)
20643 .setMemRefs(MMOBegin, MMOEnd);
20644
20645 // Jump to endMBB
20646 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20647 .addMBB(endMBB);
20648 }
20649
20650 //
20651 // Emit code to use overflow area
20652 //
20653
20654 // Load the overflow_area address into a register.
20655 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20656 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20657 .addOperand(Base)
20658 .addOperand(Scale)
20659 .addOperand(Index)
20660 .addDisp(Disp, 8)
20661 .addOperand(Segment)
20662 .setMemRefs(MMOBegin, MMOEnd);
20663
20664 // If we need to align it, do so. Otherwise, just copy the address
20665 // to OverflowDestReg.
20666 if (NeedsAlign) {
20667 // Align the overflow address
20668 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20669 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20670
20671 // aligned_addr = (addr + (align-1)) & ~(align-1)
20672 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20673 .addReg(OverflowAddrReg)
20674 .addImm(Align-1);
20675
20676 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20677 .addReg(TmpReg)
20678 .addImm(~(uint64_t)(Align-1));
20679 } else {
20680 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20681 .addReg(OverflowAddrReg);
20682 }
20683
20684 // Compute the next overflow address after this argument.
20685 // (the overflow address should be kept 8-byte aligned)
20686 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20687 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20688 .addReg(OverflowDestReg)
20689 .addImm(ArgSizeA8);
20690
20691 // Store the new overflow address.
20692 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20693 .addOperand(Base)
20694 .addOperand(Scale)
20695 .addOperand(Index)
20696 .addDisp(Disp, 8)
20697 .addOperand(Segment)
20698 .addReg(NextAddrReg)
20699 .setMemRefs(MMOBegin, MMOEnd);
20700
20701 // If we branched, emit the PHI to the front of endMBB.
20702 if (offsetMBB) {
20703 BuildMI(*endMBB, endMBB->begin(), DL,
20704 TII->get(X86::PHI), DestReg)
20705 .addReg(OffsetDestReg).addMBB(offsetMBB)
20706 .addReg(OverflowDestReg).addMBB(overflowMBB);
20707 }
20708
20709 // Erase the pseudo instruction
20710 MI->eraseFromParent();
20711
20712 return endMBB;
20713}
20714
20715MachineBasicBlock *
20716X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20717 MachineInstr *MI,
20718 MachineBasicBlock *MBB) const {
20719 // Emit code to save XMM registers to the stack. The ABI says that the
20720 // number of registers to save is given in %al, so it's theoretically
20721 // possible to do an indirect jump trick to avoid saving all of them;
20722 // however, this code takes a simpler approach and just executes all
20723 // of the stores if %al is non-zero. It's less code, and it's probably
20724 // easier on the hardware branch predictor, and stores aren't all that
20725 // expensive anyway.
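// Illustrative caller-side view (not emitted here): for a variadic call such
// as printf("%f %f\n", a, b), the SysV AMD64 ABI has the caller set %al to an
// upper bound on the number of vector registers used (2 in that example);
// below we only test %al for zero and, if it is non-zero, store all eight XMM
// argument registers.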
20726
20727 // Create the new basic blocks. One block contains all the XMM stores,
20728 // and one block is the final destination regardless of whether any
20729 // stores were performed.
20730 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20731 MachineFunction *F = MBB->getParent();
20732 MachineFunction::iterator MBBIter = MBB;
20733 ++MBBIter;
20734 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20735 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20736 F->insert(MBBIter, XMMSaveMBB);
20737 F->insert(MBBIter, EndMBB);
20738
20739 // Transfer the remainder of MBB and its successor edges to EndMBB.
20740 EndMBB->splice(EndMBB->begin(), MBB,
20741 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20742 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20743
20744 // The original block will now fall through to the XMM save block.
20745 MBB->addSuccessor(XMMSaveMBB);
20746 // The XMMSaveMBB will fall through to the end block.
20747 XMMSaveMBB->addSuccessor(EndMBB);
20748
20749 // Now add the instructions.
20750 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20751 DebugLoc DL = MI->getDebugLoc();
20752
20753 unsigned CountReg = MI->getOperand(0).getReg();
20754 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20755 int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20756
20757 if (!Subtarget->isTargetWin64()) {
20758 // If %al is 0, branch around the XMM save block.
20759 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20760 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20761 MBB->addSuccessor(EndMBB);
20762 }
20763
20764 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20765 // that was just emitted, but clearly shouldn't be "saved".
20766 assert((MI->getNumOperands() <= 3 ||
20767 !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20768 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20769 && "Expected last argument to be EFLAGS");
20770 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20771 // In the XMM save block, save all the XMM argument registers.
20772 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20773 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20774 MachineMemOperand *MMO =
20775 F->getMachineMemOperand(
20776 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20777 MachineMemOperand::MOStore,
20778 /*Size=*/16, /*Align=*/16);
20779 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20780 .addFrameIndex(RegSaveFrameIndex)
20781 .addImm(/*Scale=*/1)
20782 .addReg(/*IndexReg=*/0)
20783 .addImm(/*Disp=*/Offset)
20784 .addReg(/*Segment=*/0)
20785 .addReg(MI->getOperand(i).getReg())
20786 .addMemOperand(MMO);
20787 }
20788
20789 MI->eraseFromParent(); // The pseudo instruction is gone now.
20790
20791 return EndMBB;
20792}
20793
20794// The EFLAGS operand of SelectItr might be missing a kill marker
20795// because there were multiple uses of EFLAGS, and ISel didn't know
20796// which to mark. Figure out whether SelectItr should have had a
20797// kill marker, and set it if it should. Returns the correct kill
20798// marker value.
20799static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20800 MachineBasicBlock* BB,
20801 const TargetRegisterInfo* TRI) {
20802 // Scan forward through BB for a use/def of EFLAGS.
20803 MachineBasicBlock::iterator miI(std::next(SelectItr));
20804 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20805 const MachineInstr& mi = *miI;
20806 if (mi.readsRegister(X86::EFLAGS))
20807 return false;
20808 if (mi.definesRegister(X86::EFLAGS))
20809 break; // Should have kill-flag - update below.
20810 }
20811
20812 // If we hit the end of the block, check whether EFLAGS is live into a
20813 // successor.
20814 if (miI == BB->end()) {
20815 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20816 sEnd = BB->succ_end();
20817 sItr != sEnd; ++sItr) {
20818 MachineBasicBlock* succ = *sItr;
20819 if (succ->isLiveIn(X86::EFLAGS))
20820 return false;
20821 }
20822 }
20823
20824 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20825 // out. SelectMI should have a kill flag on EFLAGS.
20826 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20827 return true;
20828}
20829
20830MachineBasicBlock *
20831X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20832 MachineBasicBlock *BB) const {
20833 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20834 DebugLoc DL = MI->getDebugLoc();
20835
20836 // To "insert" a SELECT_CC instruction, we actually have to insert the
20837 // diamond control-flow pattern. The incoming instruction knows the
20838 // destination vreg to set, the condition code register to branch on, the
20839 // true/false values to select between, and a branch opcode to use.
20840 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20841 MachineFunction::iterator It = BB;
20842 ++It;
20843
20844 // thisMBB:
20845 // ...
20846 // TrueVal = ...
20847 // cmpTY ccX, r1, r2
20848 // bCC copy1MBB
20849 // fallthrough --> copy0MBB
20850 MachineBasicBlock *thisMBB = BB;
20851 MachineFunction *F = BB->getParent();
20852 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20853 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20854 F->insert(It, copy0MBB);
20855 F->insert(It, sinkMBB);
20856
20857 // If the EFLAGS register isn't dead in the terminator, then claim that it's
20858 // live into the sink and copy blocks.
20859 const TargetRegisterInfo *TRI =
20860 BB->getParent()->getSubtarget().getRegisterInfo();
20861 if (!MI->killsRegister(X86::EFLAGS) &&
20862 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20863 copy0MBB->addLiveIn(X86::EFLAGS);
20864 sinkMBB->addLiveIn(X86::EFLAGS);
20865 }
20866
20867 // Transfer the remainder of BB and its successor edges to sinkMBB.
20868 sinkMBB->splice(sinkMBB->begin(), BB,
20869 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20870 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20871
20872 // Add the true and fallthrough blocks as its successors.
20873 BB->addSuccessor(copy0MBB);
20874 BB->addSuccessor(sinkMBB);
20875
20876 // Create the conditional branch instruction.
20877 unsigned Opc =
20878 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20879 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20880
20881 // copy0MBB:
20882 // %FalseValue = ...
20883 // # fallthrough to sinkMBB
20884 copy0MBB->addSuccessor(sinkMBB);
20885
20886 // sinkMBB:
20887 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20888 // ...
20889 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20890 TII->get(X86::PHI), MI->getOperand(0).getReg())
20891 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20892 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20893
20894 MI->eraseFromParent(); // The pseudo instruction is gone now.
20895 return sinkMBB;
20896}
20897
20898MachineBasicBlock *
20899X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20900 MachineBasicBlock *BB) const {
20901 MachineFunction *MF = BB->getParent();
20902 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20903 DebugLoc DL = MI->getDebugLoc();
20904 const BasicBlock *LLVM_BB = BB->getBasicBlock();
20905
20906 assert(MF->shouldSplitStack());
20907
20908 const bool Is64Bit = Subtarget->is64Bit();
20909 const bool IsLP64 = Subtarget->isTarget64BitLP64();
20910
20911 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
20912 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
20913
20914 // BB:
20915 // ... [Till the alloca]
20916 // If stacklet is not large enough, jump to mallocMBB
20917 //
20918 // bumpMBB:
20919 // Allocate by subtracting from RSP
20920 // Jump to continueMBB
20921 //
20922 // mallocMBB:
20923 // Allocate by call to runtime
20924 //
20925 // continueMBB:
20926 // ...
20927 // [rest of original BB]
20928 //
20929
20930 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20931 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20932 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20933
20934 MachineRegisterInfo &MRI = MF->getRegInfo();
20935 const TargetRegisterClass *AddrRegClass =
20936 getRegClassFor(getPointerTy());
20937
20938 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20939 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20940 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
20941 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
20942 sizeVReg = MI->getOperand(1).getReg(),
20943 physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
20944
20945 MachineFunction::iterator MBBIter = BB;
20946 ++MBBIter;
20947
20948 MF->insert(MBBIter, bumpMBB);
20949 MF->insert(MBBIter, mallocMBB);
20950 MF->insert(MBBIter, continueMBB);
20951
20952 continueMBB->splice(continueMBB->begin(), BB,
20953 std::next(MachineBasicBlock::iterator(MI)), BB->end());
20954 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
20955
20956 // Add code to the main basic block to check if the stack limit has been hit,
20957 // and if so, jump to mallocMBB otherwise to bumpMBB.
20958 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
20959 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
20960 .addReg(tmpSPVReg).addReg(sizeVReg);
20961 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
20962 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
20963 .addReg(SPLimitVReg);
20964 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
20965
20966 // bumpMBB simply decreases the stack pointer, since we know the current
20967 // stacklet has enough space.
20968 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
20969 .addReg(SPLimitVReg);
20970 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
20971 .addReg(SPLimitVReg);
20972 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
20973
20974 // Calls into a routine in libgcc to allocate more space from the heap.
20975 const uint32_t *RegMask = MF->getTarget()
20976 .getSubtargetImpl()
20977 ->getRegisterInfo()
20978 ->getCallPreservedMask(CallingConv::C);
20979 if (IsLP64) {
20980 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
20981 .addReg(sizeVReg);
20982 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20983 .addExternalSymbol("__morestack_allocate_stack_space")
20984 .addRegMask(RegMask)
20985 .addReg(X86::RDI, RegState::Implicit)
20986 .addReg(X86::RAX, RegState::ImplicitDefine);
20987 } else if (Is64Bit) {
20988 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
20989 .addReg(sizeVReg);
20990 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
20991 .addExternalSymbol("__morestack_allocate_stack_space")
20992 .addRegMask(RegMask)
20993 .addReg(X86::EDI, RegState::Implicit)
20994 .addReg(X86::EAX, RegState::ImplicitDefine);
20995 } else {
20996 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
20997 .addImm(12);
20998 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
20999 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21000 .addExternalSymbol("__morestack_allocate_stack_space")
21001 .addRegMask(RegMask)
21002 .addReg(X86::EAX, RegState::ImplicitDefine);
21003 }
21004
21005 if (!Is64Bit)
21006 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21007 .addImm(16);
21008
21009 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21010 .addReg(IsLP64 ? X86::RAX : X86::EAX);
21011 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21012
21013 // Set up the CFG correctly.
21014 BB->addSuccessor(bumpMBB);
21015 BB->addSuccessor(mallocMBB);
21016 mallocMBB->addSuccessor(continueMBB);
21017 bumpMBB->addSuccessor(continueMBB);
21018
21019 // Take care of the PHI nodes.
21020 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21021 MI->getOperand(0).getReg())
21022 .addReg(mallocPtrVReg).addMBB(mallocMBB)
21023 .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21024
21025 // Delete the original pseudo instruction.
21026 MI->eraseFromParent();
21027
21028 // And we're done.
21029 return continueMBB;
21030}
21031
21032MachineBasicBlock *
21033X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21034 MachineBasicBlock *BB) const {
21035 DebugLoc DL = MI->getDebugLoc();
21036
21037 assert(!Subtarget->isTargetMachO());
21038
21039 X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21040
21041 MI->eraseFromParent(); // The pseudo instruction is gone now.
21042 return BB;
21043}
21044
21045MachineBasicBlock *
21046X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21047 MachineBasicBlock *BB) const {
21048 // This is pretty easy. We take the value loaded from the relocation,
21049 // stick it in either RDI (x86-64) or EAX (x86), and make an indirect
21050 // call through it. The return value will then be in the normal return
21051 // register.
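// On x86-64 Darwin the sequence emitted below is roughly (illustrative):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)            ## thread-local address returned in %rax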
21052 MachineFunction *F = BB->getParent();
21053 const X86InstrInfo *TII =
21054 static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21055 DebugLoc DL = MI->getDebugLoc();
21056
21057 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21058 assert(MI->getOperand(3).isGlobal() && "This should be a global");
21059
21060 // Get a register mask for the lowered call.
21061 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21062 // proper register mask.
21063 const uint32_t *RegMask = F->getTarget()
21064 .getSubtargetImpl()
21065 ->getRegisterInfo()
21066 ->getCallPreservedMask(CallingConv::C);
21067 if (Subtarget->is64Bit()) {
21068 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21069 TII->get(X86::MOV64rm), X86::RDI)
21070 .addReg(X86::RIP)
21071 .addImm(0).addReg(0)
21072 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21073 MI->getOperand(3).getTargetFlags())
21074 .addReg(0);
21075 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21076 addDirectMem(MIB, X86::RDI);
21077 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21078 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21079 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21080 TII->get(X86::MOV32rm), X86::EAX)
21081 .addReg(0)
21082 .addImm(0).addReg(0)
21083 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21084 MI->getOperand(3).getTargetFlags())
21085 .addReg(0);
21086 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21087 addDirectMem(MIB, X86::EAX);
21088 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21089 } else {
21090 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21091 TII->get(X86::MOV32rm), X86::EAX)
21092 .addReg(TII->getGlobalBaseReg(F))
21093 .addImm(0).addReg(0)
21094 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21095 MI->getOperand(3).getTargetFlags())
21096 .addReg(0);
21097 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21098 addDirectMem(MIB, X86::EAX);
21099 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21100 }
21101
21102 MI->eraseFromParent(); // The pseudo instruction is gone now.
21103 return BB;
21104}
21105
21106MachineBasicBlock *
21107X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21108 MachineBasicBlock *MBB) const {
21109 DebugLoc DL = MI->getDebugLoc();
21110 MachineFunction *MF = MBB->getParent();
21111 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21112 MachineRegisterInfo &MRI = MF->getRegInfo();
21113
21114 const BasicBlock *BB = MBB->getBasicBlock();
21115 MachineFunction::iterator I = MBB;
21116 ++I;
21117
21118 // Memory Reference
21119 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21120 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21121
21122 unsigned DstReg;
21123 unsigned MemOpndSlot = 0;
21124
21125 unsigned CurOp = 0;
21126
21127 DstReg = MI->getOperand(CurOp++).getReg();
21128 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21129 assert(RC->hasType(MVT::i32) && "Invalid destination!");
21130 unsigned mainDstReg = MRI.createVirtualRegister(RC);
21131 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21132
21133 MemOpndSlot = CurOp;
21134
21135 MVT PVT = getPointerTy();
21136 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21137 "Invalid Pointer Size!");
21138
21139 // For v = setjmp(buf), we generate
21140 //
21141 // thisMBB:
21142 // buf[LabelOffset] = restoreMBB
21143 // SjLjSetup restoreMBB
21144 //
21145 // mainMBB:
21146 // v_main = 0
21147 //
21148 // sinkMBB:
21149 // v = phi(main, restore)
21150 //
21151 // restoreMBB:
21152 // if base pointer being used, load it from frame
21153 // v_restore = 1
21154
21155 MachineBasicBlock *thisMBB = MBB;
21156 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21157 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21158 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21159 MF->insert(I, mainMBB);
21160 MF->insert(I, sinkMBB);
21161 MF->push_back(restoreMBB);
21162
21163 MachineInstrBuilder MIB;
21164
21165 // Transfer the remainder of BB and its successor edges to sinkMBB.
21166 sinkMBB->splice(sinkMBB->begin(), MBB,
21167 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21168 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21169
21170 // thisMBB:
21171 unsigned PtrStoreOpc = 0;
21172 unsigned LabelReg = 0;
21173 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21174 Reloc::Model RM = MF->getTarget().getRelocationModel();
21175 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21176 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21177
21178 // Prepare IP either in reg or imm.
21179 if (!UseImmLabel) {
21180 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21181 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21182 LabelReg = MRI.createVirtualRegister(PtrRC);
21183 if (Subtarget->is64Bit()) {
21184 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21185 .addReg(X86::RIP)
21186 .addImm(0)
21187 .addReg(0)
21188 .addMBB(restoreMBB)
21189 .addReg(0);
21190 } else {
21191 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21192 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21193 .addReg(XII->getGlobalBaseReg(MF))
21194 .addImm(0)
21195 .addReg(0)
21196 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21197 .addReg(0);
21198 }
21199 } else
21200 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21201 // Store IP
21202 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21203 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21204 if (i == X86::AddrDisp)
21205 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21206 else
21207 MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21208 }
21209 if (!UseImmLabel)
21210 MIB.addReg(LabelReg);
21211 else
21212 MIB.addMBB(restoreMBB);
21213 MIB.setMemRefs(MMOBegin, MMOEnd);
21214 // Setup
21215 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21216 .addMBB(restoreMBB);
21217
21218 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21219 MF->getSubtarget().getRegisterInfo());
21220 MIB.addRegMask(RegInfo->getNoPreservedMask());
21221 thisMBB->addSuccessor(mainMBB);
21222 thisMBB->addSuccessor(restoreMBB);
21223
21224 // mainMBB:
21225 // EAX = 0
21226 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21227 mainMBB->addSuccessor(sinkMBB);
21228
21229 // sinkMBB:
21230 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21231 TII->get(X86::PHI), DstReg)
21232 .addReg(mainDstReg).addMBB(mainMBB)
21233 .addReg(restoreDstReg).addMBB(restoreMBB);
21234
21235 // restoreMBB:
21236 if (RegInfo->hasBasePointer(*MF)) {
21237 const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21238 const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21239 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21240 X86FI->setRestoreBasePointer(MF);
21241 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21242 unsigned BasePtr = RegInfo->getBaseRegister();
21243 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21244 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21245 FramePtr, true, X86FI->getRestoreBasePointerOffset())
21246 .setMIFlag(MachineInstr::FrameSetup);
21247 }
21248 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21249 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21250 restoreMBB->addSuccessor(sinkMBB);
21251
21252 MI->eraseFromParent();
21253 return sinkMBB;
21254}
21255
21256MachineBasicBlock *
21257X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21258 MachineBasicBlock *MBB) const {
21259 DebugLoc DL = MI->getDebugLoc();
21260 MachineFunction *MF = MBB->getParent();
21261 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21262 MachineRegisterInfo &MRI = MF->getRegInfo();
21263
21264 // Memory Reference
21265 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21266 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21267
21268 MVT PVT = getPointerTy();
21269 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21270 "Invalid Pointer Size!");
21271
21272 const TargetRegisterClass *RC =
21273 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21274 unsigned Tmp = MRI.createVirtualRegister(RC);
21275 // Since FP is only updated here but NOT referenced, it's treated as GPR.
21276 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21277 MF->getSubtarget().getRegisterInfo());
21278 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21279 unsigned SP = RegInfo->getStackRegister();
21280
21281 MachineInstrBuilder MIB;
21282
21283 const int64_t LabelOffset = 1 * PVT.getStoreSize();
21284 const int64_t SPOffset = 2 * PVT.getStoreSize();
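// The jump buffer addressed by this pseudo's memory operands is read as
// pointer-sized slots: slot 0 holds the saved frame pointer, slot 1
// (LabelOffset) the resume address, and slot 2 (SPOffset) the saved stack
// pointer; the three loads below restore FP, IP and SP from those slots.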
21285
21286 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21287 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21288
21289 // Reload FP
21290 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21291 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21292 MIB.addOperand(MI->getOperand(i));
21293 MIB.setMemRefs(MMOBegin, MMOEnd);
21294 // Reload IP
21295 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21296 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21297 if (i == X86::AddrDisp)
21298 MIB.addDisp(MI->getOperand(i), LabelOffset);
21299 else
21300 MIB.addOperand(MI->getOperand(i));
21301 }
21302 MIB.setMemRefs(MMOBegin, MMOEnd);
21303 // Reload SP
21304 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21305 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21306 if (i == X86::AddrDisp)
21307 MIB.addDisp(MI->getOperand(i), SPOffset);
21308 else
21309 MIB.addOperand(MI->getOperand(i));
21310 }
21311 MIB.setMemRefs(MMOBegin, MMOEnd);
21312 // Jump
21313 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21314
21315 MI->eraseFromParent();
21316 return MBB;
21317}
21318
21319// Replace 213-type (isel default) FMA3 instructions with 231-type for
21320// accumulator loops. Writing back to the accumulator allows the coalescer
21321// to remove extra copies in the loop.
21322MachineBasicBlock *
21323X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21324 MachineBasicBlock *MBB) const {
21325 MachineOperand &AddendOp = MI->getOperand(3);
21326
21327 // Bail out early if the addend isn't a register - we can't switch these.
21328 if (!AddendOp.isReg())
21329 return MBB;
21330
21331 MachineFunction &MF = *MBB->getParent();
21332 MachineRegisterInfo &MRI = MF.getRegInfo();
21333
21334 // Check whether the addend is defined by a PHI:
21335 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21336 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21337 if (!AddendDef.isPHI())
21338 return MBB;
21339
21340 // Look for the following pattern:
21341 // loop:
21342 // %addend = phi [%entry, 0], [%loop, %result]
21343 // ...
21344 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21345
21346 // Replace with:
21347 // loop:
21348 // %addend = phi [%entry, 0], [%loop, %result]
21349 // ...
21350 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21351
21352 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21353 assert(AddendDef.getOperand(i).isReg());
21354 MachineOperand PHISrcOp = AddendDef.getOperand(i);
21355 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21356 if (&PHISrcInst == MI) {
21357 // Found a matching instruction.
21358 unsigned NewFMAOpc = 0;
21359 switch (MI->getOpcode()) {
21360 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21361 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21362 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21363 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21364 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21365 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21366 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21367 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21368 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21369 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21370 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21371 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21372 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21373 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21374 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21375 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21376 case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21377 case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21378 case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21379 case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21380
21381 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21382 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21383 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21384 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21385 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21386 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21387 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21388 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21389 case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21390 case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21391 case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21392 case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21393 default: llvm_unreachable("Unrecognized FMA variant.");
21394 }
21395
21396 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21397 MachineInstrBuilder MIB =
21398 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21399 .addOperand(MI->getOperand(0))
21400 .addOperand(MI->getOperand(3))
21401 .addOperand(MI->getOperand(2))
21402 .addOperand(MI->getOperand(1));
21403 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21404 MI->eraseFromParent();
21405 }
21406 }
21407
21408 return MBB;
21409}
21410
21411MachineBasicBlock *
21412X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21413 MachineBasicBlock *BB) const {
21414 switch (MI->getOpcode()) {
21415 default: llvm_unreachable("Unexpected instr type to insert");
21416 case X86::TAILJMPd64:
21417 case X86::TAILJMPr64:
21418 case X86::TAILJMPm64:
21419 case X86::TAILJMPd64_REX:
21420 case X86::TAILJMPr64_REX:
21421 case X86::TAILJMPm64_REX:
21422 llvm_unreachable("TAILJMP64 would not be touched here.");
21423 case X86::TCRETURNdi64:
21424 case X86::TCRETURNri64:
21425 case X86::TCRETURNmi64:
21426 return BB;
21427 case X86::WIN_ALLOCA:
21428 return EmitLoweredWinAlloca(MI, BB);
21429 case X86::SEG_ALLOCA_32:
21430 case X86::SEG_ALLOCA_64:
21431 return EmitLoweredSegAlloca(MI, BB);
21432 case X86::TLSCall_32:
21433 case X86::TLSCall_64:
21434 return EmitLoweredTLSCall(MI, BB);
21435 case X86::CMOV_GR8:
21436 case X86::CMOV_FR32:
21437 case X86::CMOV_FR64:
21438 case X86::CMOV_V4F32:
21439 case X86::CMOV_V2F64:
21440 case X86::CMOV_V2I64:
21441 case X86::CMOV_V8F32:
21442 case X86::CMOV_V4F64:
21443 case X86::CMOV_V4I64:
21444 case X86::CMOV_V16F32:
21445 case X86::CMOV_V8F64:
21446 case X86::CMOV_V8I64:
21447 case X86::CMOV_GR16:
21448 case X86::CMOV_GR32:
21449 case X86::CMOV_RFP32:
21450 case X86::CMOV_RFP64:
21451 case X86::CMOV_RFP80:
21452 return EmitLoweredSelect(MI, BB);
21453
21454 case X86::FP32_TO_INT16_IN_MEM:
21455 case X86::FP32_TO_INT32_IN_MEM:
21456 case X86::FP32_TO_INT64_IN_MEM:
21457 case X86::FP64_TO_INT16_IN_MEM:
21458 case X86::FP64_TO_INT32_IN_MEM:
21459 case X86::FP64_TO_INT64_IN_MEM:
21460 case X86::FP80_TO_INT16_IN_MEM:
21461 case X86::FP80_TO_INT32_IN_MEM:
21462 case X86::FP80_TO_INT64_IN_MEM: {
21463 MachineFunction *F = BB->getParent();
21464 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21465 DebugLoc DL = MI->getDebugLoc();
21466
21467 // Change the floating point control register to use "round towards zero"
21468 // mode when truncating to an integer value.
21469 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21470 addFrameReference(BuildMI(*BB, MI, DL,
21471 TII->get(X86::FNSTCW16m)), CWFrameIdx);
21472
21473 // Load the old value of the high byte of the control word...
21474 unsigned OldCW =
21475 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21476 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21477 CWFrameIdx);
21478
21479 // Set the high part to be round to zero...
21480 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21481 .addImm(0xC7F);
21482
21483 // Reload the modified control word now...
21484 addFrameReference(BuildMI(*BB, MI, DL,
21485 TII->get(X86::FLDCW16m)), CWFrameIdx);
21486
21487 // Restore the memory image of control word to original value
21488 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21489 .addReg(OldCW);
21490
21491 // Get the X86 opcode to use.
21492 unsigned Opc;
21493 switch (MI->getOpcode()) {
21494 default: llvm_unreachable("illegal opcode!");
21495 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21496 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21497 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21498 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21499 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21500 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21501 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21502 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21503 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21504 }
21505
21506 X86AddressMode AM;
21507 MachineOperand &Op = MI->getOperand(0);
21508 if (Op.isReg()) {
21509 AM.BaseType = X86AddressMode::RegBase;
21510 AM.Base.Reg = Op.getReg();
21511 } else {
21512 AM.BaseType = X86AddressMode::FrameIndexBase;
21513 AM.Base.FrameIndex = Op.getIndex();
21514 }
21515 Op = MI->getOperand(1);
21516 if (Op.isImm())
21517 AM.Scale = Op.getImm();
21518 Op = MI->getOperand(2);
21519 if (Op.isImm())
21520 AM.IndexReg = Op.getImm();
21521 Op = MI->getOperand(3);
21522 if (Op.isGlobal()) {
21523 AM.GV = Op.getGlobal();
21524 } else {
21525 AM.Disp = Op.getImm();
21526 }
21527 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21528 .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21529
21530 // Reload the original control word now.
21531 addFrameReference(BuildMI(*BB, MI, DL,
21532 TII->get(X86::FLDCW16m)), CWFrameIdx);
21533
21534 MI->eraseFromParent(); // The pseudo instruction is gone now.
21535 return BB;
21536 }
21537 // String/text processing lowering.
21538 case X86::PCMPISTRM128REG:
21539 case X86::VPCMPISTRM128REG:
21540 case X86::PCMPISTRM128MEM:
21541 case X86::VPCMPISTRM128MEM:
21542 case X86::PCMPESTRM128REG:
21543 case X86::VPCMPESTRM128REG:
21544 case X86::PCMPESTRM128MEM:
21545 case X86::VPCMPESTRM128MEM:
21546 assert(Subtarget->hasSSE42() &&
21547 "Target must have SSE4.2 or AVX features enabled");
21548 return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21549
21550 // String/text processing lowering.
21551 case X86::PCMPISTRIREG:
21552 case X86::VPCMPISTRIREG:
21553 case X86::PCMPISTRIMEM:
21554 case X86::VPCMPISTRIMEM:
21555 case X86::PCMPESTRIREG:
21556 case X86::VPCMPESTRIREG:
21557 case X86::PCMPESTRIMEM:
21558 case X86::VPCMPESTRIMEM:
21559 assert(Subtarget->hasSSE42() &&
21560 "Target must have SSE4.2 or AVX features enabled");
21561 return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21562
21563 // Thread synchronization.
21564 case X86::MONITOR:
21565 return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21566 Subtarget);
21567
21568 // xbegin
21569 case X86::XBEGIN:
21570 return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21571
21572 case X86::VASTART_SAVE_XMM_REGS:
21573 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21574
21575 case X86::VAARG_64:
21576 return EmitVAARG64WithCustomInserter(MI, BB);
21577
21578 case X86::EH_SjLj_SetJmp32:
21579 case X86::EH_SjLj_SetJmp64:
21580 return emitEHSjLjSetJmp(MI, BB);
21581
21582 case X86::EH_SjLj_LongJmp32:
21583 case X86::EH_SjLj_LongJmp64:
21584 return emitEHSjLjLongJmp(MI, BB);
21585
21586 case TargetOpcode::STATEPOINT:
21587 // As an implementation detail, STATEPOINT shares the STACKMAP format at
21588 // this point in the process. We diverge later.
21589 return emitPatchPoint(MI, BB);
21590
21591 case TargetOpcode::STACKMAP:
21592 case TargetOpcode::PATCHPOINT:
21593 return emitPatchPoint(MI, BB);
21594
21595 case X86::VFMADDPDr213r:
21596 case X86::VFMADDPSr213r:
21597 case X86::VFMADDSDr213r:
21598 case X86::VFMADDSSr213r:
21599 case X86::VFMSUBPDr213r:
21600 case X86::VFMSUBPSr213r:
21601 case X86::VFMSUBSDr213r:
21602 case X86::VFMSUBSSr213r:
21603 case X86::VFNMADDPDr213r:
21604 case X86::VFNMADDPSr213r:
21605 case X86::VFNMADDSDr213r:
21606 case X86::VFNMADDSSr213r:
21607 case X86::VFNMSUBPDr213r:
21608 case X86::VFNMSUBPSr213r:
21609 case X86::VFNMSUBSDr213r:
21610 case X86::VFNMSUBSSr213r:
21611 case X86::VFMADDSUBPDr213r:
21612 case X86::VFMADDSUBPSr213r:
21613 case X86::VFMSUBADDPDr213r:
21614 case X86::VFMSUBADDPSr213r:
21615 case X86::VFMADDPDr213rY:
21616 case X86::VFMADDPSr213rY:
21617 case X86::VFMSUBPDr213rY:
21618 case X86::VFMSUBPSr213rY:
21619 case X86::VFNMADDPDr213rY:
21620 case X86::VFNMADDPSr213rY:
21621 case X86::VFNMSUBPDr213rY:
21622 case X86::VFNMSUBPSr213rY:
21623 case X86::VFMADDSUBPDr213rY:
21624 case X86::VFMADDSUBPSr213rY:
21625 case X86::VFMSUBADDPDr213rY:
21626 case X86::VFMSUBADDPSr213rY:
21627 return emitFMA3Instr(MI, BB);
21628 }
21629}
21630
21631//===----------------------------------------------------------------------===//
21632// X86 Optimization Hooks
21633//===----------------------------------------------------------------------===//
21634
21635void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21636 APInt &KnownZero,
21637 APInt &KnownOne,
21638 const SelectionDAG &DAG,
21639 unsigned Depth) const {
21640 unsigned BitWidth = KnownZero.getBitWidth();
21641 unsigned Opc = Op.getOpcode();
21642 assert((Opc >= ISD::BUILTIN_OP_END ||
21643 Opc == ISD::INTRINSIC_WO_CHAIN ||
21644 Opc == ISD::INTRINSIC_W_CHAIN ||
21645 Opc == ISD::INTRINSIC_VOID) &&
21646 "Should use MaskedValueIsZero if you don't know whether Op"
21647 " is a target node!");
21648
21649 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
21650 switch (Opc) {
21651 default: break;
21652 case X86ISD::ADD:
21653 case X86ISD::SUB:
21654 case X86ISD::ADC:
21655 case X86ISD::SBB:
21656 case X86ISD::SMUL:
21657 case X86ISD::UMUL:
21658 case X86ISD::INC:
21659 case X86ISD::DEC:
21660 case X86ISD::OR:
21661 case X86ISD::XOR:
21662 case X86ISD::AND:
21663 // These nodes' second result is a boolean.
21664 if (Op.getResNo() == 0)
21665 break;
21666 // Fallthrough
21667 case X86ISD::SETCC:
21668 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21669 break;
21670 case ISD::INTRINSIC_WO_CHAIN: {
21671 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21672 unsigned NumLoBits = 0;
21673 switch (IntId) {
21674 default: break;
21675 case Intrinsic::x86_sse_movmsk_ps:
21676 case Intrinsic::x86_avx_movmsk_ps_256:
21677 case Intrinsic::x86_sse2_movmsk_pd:
21678 case Intrinsic::x86_avx_movmsk_pd_256:
21679 case Intrinsic::x86_mmx_pmovmskb:
21680 case Intrinsic::x86_sse2_pmovmskb_128:
21681 case Intrinsic::x86_avx2_pmovmskb: {
21682 // High bits of movmskp{s|d}, pmovmskb are known zero.
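// For example, movmskps on a v4f32 operand defines only the low four bits
// of its i32 result, so the upper 28 bits can be reported as known zero.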
21683 switch (IntId) {
21684 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21685 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
21686 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
21687 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
21688 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
21689 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
21690 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
21691 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
21692 }
21693 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21694 break;
21695 }
21696 }
21697 break;
21698 }
21699 }
21700}
21701
21702unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21703 SDValue Op,
21704 const SelectionDAG &,
21705 unsigned Depth) const {
21706 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21707 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21708 return Op.getValueType().getScalarType().getSizeInBits();
21709
21710 // Fallback case.
21711 return 1;
21712}
21713
21714/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21715/// node is a GlobalAddress + offset.
21716bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21717 const GlobalValue* &GA,
21718 int64_t &Offset) const {
21719 if (N->getOpcode() == X86ISD::Wrapper) {
21720 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21721 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21722 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21723 return true;
21724 }
21725 }
21726 return TargetLowering::isGAPlusOffset(N, GA, Offset);
21727}
21728
21729/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21730 /// same as extracting the high 128-bit part of a 256-bit vector and then
21731 /// inserting the result into the low part of a new 256-bit vector.
21732static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21733 EVT VT = SVOp->getValueType(0);
21734 unsigned NumElems = VT.getVectorNumElements();
21735
21736 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21737 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21738 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21739 SVOp->getMaskElt(j) >= 0)
21740 return false;
21741
21742 return true;
21743}
21744
21745/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21746 /// same as extracting the low 128-bit part of a 256-bit vector and then
21747 /// inserting the result into the high part of a new 256-bit vector.
21748static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21749 EVT VT = SVOp->getValueType(0);
21750 unsigned NumElems = VT.getVectorNumElements();
21751
21752 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21753 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21754 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21755 SVOp->getMaskElt(j) >= 0)
21756 return false;
21757
21758 return true;
21759}
21760
21761/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21762static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21763 TargetLowering::DAGCombinerInfo &DCI,
21764 const X86Subtarget* Subtarget) {
21765 SDLoc dl(N);
21766 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21767 SDValue V1 = SVOp->getOperand(0);
21768 SDValue V2 = SVOp->getOperand(1);
21769 EVT VT = SVOp->getValueType(0);
21770 unsigned NumElems = VT.getVectorNumElements();
21771
21772 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21773 V2.getOpcode() == ISD::CONCAT_VECTORS) {
21774 //
21775 // 0,0,0,...
21776 // |
21777 // V UNDEF BUILD_VECTOR UNDEF
21778 // \ / \ /
21779 // CONCAT_VECTOR CONCAT_VECTOR
21780 // \ /
21781 // \ /
21782 // RESULT: V + zero extended
21783 //
21784 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21785 V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21786 V1.getOperand(1).getOpcode() != ISD::UNDEF)
21787 return SDValue();
21788
21789 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21790 return SDValue();
21791
21792 // To match the shuffle mask, the first half of the mask should
21793 // be exactly the first vector, and all the rest a splat with the
21794 // first element of the second one.
21795 for (unsigned i = 0; i != NumElems/2; ++i)
21796 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21797 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21798 return SDValue();
21799
21800 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21801 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21802 if (Ld->hasNUsesOfValue(1, 0)) {
21803 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21804 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21805 SDValue ResNode =
21806 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21807 Ld->getMemoryVT(),
21808 Ld->getPointerInfo(),
21809 Ld->getAlignment(),
21810 false/*isVolatile*/, true/*ReadMem*/,
21811 false/*WriteMem*/);
21812
21813 // Make sure the newly-created LOAD is in the same position as Ld in
21814 // terms of dependency. We create a TokenFactor for Ld and ResNode,
21815 // and update uses of Ld's output chain to use the TokenFactor.
21816 if (Ld->hasAnyUseOfValue(1)) {
21817 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21818 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21819 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21820 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21821 SDValue(ResNode.getNode(), 1));
21822 }
21823
21824 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21825 }
21826 }
21827
21828 // Emit a zeroed vector and insert the desired subvector on its
21829 // first half.
21830 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21831 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21832 return DCI.CombineTo(N, InsV);
21833 }
21834
21835 //===--------------------------------------------------------------------===//
21836 // Combine some shuffles into subvector extracts and inserts:
21837 //
21838
21839 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21840 if (isShuffleHigh128VectorInsertLow(SVOp)) {
21841 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21842 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21843 return DCI.CombineTo(N, InsV);
21844 }
21845
21846 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21847 if (isShuffleLow128VectorInsertHigh(SVOp)) {
21848 SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21849 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21850 return DCI.CombineTo(N, InsV);
21851 }
21852
21853 return SDValue();
21854}
21855
21856/// \brief Combine an arbitrary chain of shuffles into a single instruction if
21857/// possible.
21858///
21859 /// This is the leaf of the recursive combine below. When we have found some
21860/// chain of single-use x86 shuffle instructions and accumulated the combined
21861/// shuffle mask represented by them, this will try to pattern match that mask
21862/// into either a single instruction if there is a special purpose instruction
21863/// for this operation, or into a PSHUFB instruction which is a fully general
21864/// instruction but should only be used to replace chains over a certain depth.
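///
/// For example, a float-domain mask of <0, 0> is matched to MOVDDUP (with
/// SSE3) or MOVLHPS, and <0, 0, 2, 2> to MOVSLDUP (SSE3 only), before deeper
/// chains fall back to the generic PSHUFB path.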
21865static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21866 int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21867 TargetLowering::DAGCombinerInfo &DCI,
21868 const X86Subtarget *Subtarget) {
21869 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
21870
21871 // Find the operand that enters the chain. Note that multiple uses are OK
21872 // here, we're not going to remove the operand we find.
21873 SDValue Input = Op.getOperand(0);
21874 while (Input.getOpcode() == ISD::BITCAST)
21875 Input = Input.getOperand(0);
21876
21877 MVT VT = Input.getSimpleValueType();
21878 MVT RootVT = Root.getSimpleValueType();
21879 SDLoc DL(Root);
21880
21881 // Just remove no-op shuffle masks.
21882 if (Mask.size() == 1) {
21883 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
21884 /*AddTo*/ true);
21885 return true;
21886 }
21887
21888 // Use the float domain if the operand type is a floating point type.
21889 bool FloatDomain = VT.isFloatingPoint();
21890
21891 // For floating point shuffles, we don't have free copies in the shuffle
21892 // instructions or the ability to load as part of the instruction, so
21893 // canonicalize their shuffles to UNPCK or MOV variants.
21894 //
21895 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
21896 // vectors because it can have a load folded into it that UNPCK cannot. This
21897 // doesn't preclude something switching to the shorter encoding post-RA.
21898 if (FloatDomain) {
21899 if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
21900 bool Lo = Mask.equals(0, 0);
21901 unsigned Shuffle;
21902 MVT ShuffleVT;
21903 // Check if we have SSE3 which will let us use MOVDDUP. That instruction
21904 // is no slower than UNPCKLPD but has the option to fold the input operand
21905 // into even an unaligned memory load.
21906 if (Lo && Subtarget->hasSSE3()) {
21907 Shuffle = X86ISD::MOVDDUP;
21908 ShuffleVT = MVT::v2f64;
21909 } else {
21910 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
21911 // than the UNPCK variants.
21912 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
21913 ShuffleVT = MVT::v4f32;
21914 }
21915 if (Depth == 1 && Root->getOpcode() == Shuffle)
21916 return false; // Nothing to do!
21917 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21918 DCI.AddToWorklist(Op.getNode());
21919 if (Shuffle == X86ISD::MOVDDUP)
21920 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21921 else
21922 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21923 DCI.AddToWorklist(Op.getNode());
21924 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21925 /*AddTo*/ true);
21926 return true;
21927 }
21928 if (Subtarget->hasSSE3() &&
21929 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
21930 bool Lo = Mask.equals(0, 0, 2, 2);
21931 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
21932 MVT ShuffleVT = MVT::v4f32;
21933 if (Depth == 1 && Root->getOpcode() == Shuffle)
21934 return false; // Nothing to do!
21935 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21936 DCI.AddToWorklist(Op.getNode());
21937 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21938 DCI.AddToWorklist(Op.getNode());
21939 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21940 /*AddTo*/ true);
21941 return true;
21942 }
21943 if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
21944 bool Lo = Mask.equals(0, 0, 1, 1);
21945 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21946 MVT ShuffleVT = MVT::v4f32;
21947 if (Depth == 1 && Root->getOpcode() == Shuffle)
21948 return false; // Nothing to do!
21949 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21950 DCI.AddToWorklist(Op.getNode());
21951 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21952 DCI.AddToWorklist(Op.getNode());
21953 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21954 /*AddTo*/ true);
21955 return true;
21956 }
21957 }
21958
21959 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
21960 // variants as none of these have single-instruction variants that are
21961 // superior to the UNPCK formulation.
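// For example, the v8i16 mask <0, 0, 1, 1, 2, 2, 3, 3> becomes a single
// UNPCKL of the input with itself, and <4, 4, 5, 5, 6, 6, 7, 7> the
// corresponding UNPCKH.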
21962 if (!FloatDomain &&
21963 (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
21964 Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
21965 Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
21966 Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
21967 15))) {
21968 bool Lo = Mask[0] == 0;
21969 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21970 if (Depth == 1 && Root->getOpcode() == Shuffle)
21971 return false; // Nothing to do!
21972 MVT ShuffleVT;
21973 switch (Mask.size()) {
21974 case 8:
21975 ShuffleVT = MVT::v8i16;
21976 break;
21977 case 16:
21978 ShuffleVT = MVT::v16i8;
21979 break;
21980 default:
21981 llvm_unreachable("Impossible mask size!");
21982 };
21983 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21984 DCI.AddToWorklist(Op.getNode());
21985 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21986 DCI.AddToWorklist(Op.getNode());
21987 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21988 /*AddTo*/ true);
21989 return true;
21990 }
21991
21992 // Don't try to re-form single instruction chains under any circumstances now
21993 // that we've done encoding canonicalization for them.
21994 if (Depth < 2)
21995 return false;
21996
21997 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
21998 // can replace them with a single PSHUFB instruction profitably. Intel's
21999 // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
22000 // in practice PSHUFB tends to be *very* fast so we're more aggressive.
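// As a sketch of the mask expansion below: a v4i32 mask <1, 0, 3, 2> has
// Ratio = 16/4 = 4 and becomes the byte mask
// <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>, with 255 (a zeroing index for
// PSHUFB) substituted for any SM_SentinelZero element.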
22001 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22002 SmallVector<SDValue, 16> PSHUFBMask;
22003 assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22004 int Ratio = 16 / Mask.size();
22005 for (unsigned i = 0; i < 16; ++i) {
22006 if (Mask[i / Ratio] == SM_SentinelUndef) {
22007 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22008 continue;
22009 }
22010 int M = Mask[i / Ratio] != SM_SentinelZero
22011 ? Ratio * Mask[i / Ratio] + i % Ratio
22012 : 255;
22013 PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22014 }
22015 Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22016 DCI.AddToWorklist(Op.getNode());
22017 SDValue PSHUFBMaskOp =
22018 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22019 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22020 Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22021 DCI.AddToWorklist(Op.getNode());
22022 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22023 /*AddTo*/ true);
22024 return true;
22025 }
22026
22027 // Failed to find any combines.
22028 return false;
22029}
22030
22031/// \brief Fully generic combining of x86 shuffle instructions.
22032///
22033/// This should be the last combine run over the x86 shuffle instructions. Once
22034/// they have been fully optimized, this will recursively consider all chains
22035/// of single-use shuffle instructions, build a generic model of the cumulative
22036/// shuffle operation, and check for simpler instructions which implement this
22037/// operation. We use this primarily for two purposes:
22038///
22039/// 1) Collapse generic shuffles to specialized single instructions when
22040/// equivalent. In most cases, this is just an encoding size win, but
22041/// sometimes we will collapse multiple generic shuffles into a single
22042/// special-purpose shuffle.
22043/// 2) Look for sequences of shuffle instructions with 3 or more total
22044/// instructions, and replace them with the slightly more expensive SSSE3
22045/// PSHUFB instruction if available. We do this as the last combining step
22046/// to ensure we avoid using PSHUFB if we can implement the shuffle with
22047 /// a suitable short sequence of other instructions. The PSHUFB will either
22048/// use a register or have to read from memory and so is slightly (but only
22049/// slightly) more expensive than the other shuffle instructions.
22050///
22051/// Because this is inherently a quadratic operation (for each shuffle in
22052/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22053/// This should never be an issue in practice as the shuffle lowering doesn't
22054/// produce sequences of more than 8 instructions.
22055///
22056/// FIXME: We will currently miss some cases where the redundant shuffling
22057/// would simplify under the threshold for PSHUFB formation because of
22058/// combine-ordering. To fix this, we should do the redundant instruction
22059/// combining in this recursive walk.
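///
/// For example, two back-to-back "PSHUFD <2,3,0,1>" nodes can accumulate to
/// the identity mask, in which case the whole chain is simply replaced by its
/// original input, while longer chains of word/byte shuffles may instead fold
/// into a single PSHUFB when SSSE3 is available.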
22060static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22061 ArrayRef<int> RootMask,
22062 int Depth, bool HasPSHUFB,
22063 SelectionDAG &DAG,
22064 TargetLowering::DAGCombinerInfo &DCI,
22065 const X86Subtarget *Subtarget) {
22066 // Bound the depth of our recursive combine because this is ultimately
22067 // quadratic in nature.
22068 if (Depth > 8)
22069 return false;
22070
22071 // Directly rip through bitcasts to find the underlying operand.
22072 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22073 Op = Op.getOperand(0);
22074
22075 MVT VT = Op.getSimpleValueType();
22076 if (!VT.isVector())
22077 return false; // Bail if we hit a non-vector.
22078 // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22079 // version should be added.
22080 if (VT.getSizeInBits() != 128)
22081 return false;
22082
22083 assert(Root.getSimpleValueType().isVector() &&
22084 "Shuffles operate on vector types!");
22085 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22086 "Can only combine shuffles of the same vector register size.");
22087
22088 if (!isTargetShuffle(Op.getOpcode()))
22089 return false;
22090 SmallVector<int, 16> OpMask;
22091 bool IsUnary;
22092 bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22093 // We only can combine unary shuffles which we can decode the mask for.
22094 if (!HaveMask || !IsUnary)
22095 return false;
22096
22097 assert(VT.getVectorNumElements() == OpMask.size() &&
22098 "Different mask size from vector size!");
22099 assert(((RootMask.size() > OpMask.size() &&
22100 RootMask.size() % OpMask.size() == 0) ||
22101 (OpMask.size() > RootMask.size() &&
22102 OpMask.size() % RootMask.size() == 0) ||
22103 OpMask.size() == RootMask.size()) &&
22104 "The smaller number of elements must divide the larger.");
22105 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22106 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22107 assert(((RootRatio == 1 && OpRatio == 1) ||
22108 (RootRatio == 1) != (OpRatio == 1)) &&
22109 "Must not have a ratio for both incoming and op masks!");
22110
22111 SmallVector<int, 16> Mask;
22112 Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22113
22114 // Merge this shuffle operation's mask into our accumulated mask. Note that
22115 // this shuffle's mask will be the first applied to the input, followed by the
22116 // root mask to get us all the way to the root value arrangement. The reason
22117 // for this order is that we are recursing up the operation chain.
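// For instance, with RootMask = <1, 0> (so RootRatio = 2) over an operand
// whose OpMask = <2, 3, 0, 1> (OpRatio = 1), element i = 0 maps through
// RootMaskedIdx = 1*2+0 = 2 to OpMask[2] = 0, and the full composition is the
// identity <0, 1, 2, 3>: the two half-swapping shuffles cancel out.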
22118 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22119 int RootIdx = i / RootRatio;
22120 if (RootMask[RootIdx] < 0) {
22121 // This is a zero or undef lane, we're done.
22122 Mask.push_back(RootMask[RootIdx]);
22123 continue;
22124 }
22125
22126 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22127 int OpIdx = RootMaskedIdx / OpRatio;
22128 if (OpMask[OpIdx] < 0) {
22129 // The incoming lanes are zero or undef, it doesn't matter which ones we
22130 // are using.
22131 Mask.push_back(OpMask[OpIdx]);
22132 continue;
22133 }
22134
22135 // Ok, we have non-zero lanes, map them through.
22136 Mask.push_back(OpMask[OpIdx] * OpRatio +
22137 RootMaskedIdx % OpRatio);
22138 }
22139
22140 // See if we can recurse into the operand to combine more things.
22141 switch (Op.getOpcode()) {
22142 case X86ISD::PSHUFB:
22143 HasPSHUFB = true;
22144 case X86ISD::PSHUFD:
22145 case X86ISD::PSHUFHW:
22146 case X86ISD::PSHUFLW:
22147 if (Op.getOperand(0).hasOneUse() &&
22148 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22149 HasPSHUFB, DAG, DCI, Subtarget))
22150 return true;
22151 break;
22152
22153 case X86ISD::UNPCKL:
22154 case X86ISD::UNPCKH:
22155 assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22156 // We can't check for single use; we have to check that this shuffle is the only user.
22157 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22158 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22159 HasPSHUFB, DAG, DCI, Subtarget))
22160 return true;
22161 break;
22162 }
22163
22164 // Minor canonicalization of the accumulated shuffle mask to make it easier
22165 // to match below. All this does is detect masks with sequential pairs of
22166 // elements, and shrink them to the half-width mask. It does this in a loop
22167 // so it will reduce the size of the mask to the minimal width mask which
22168 // performs an equivalent shuffle.
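// For instance, <2, 3, 0, 1> shrinks to the half-width mask <1, 0>, and an
// identity mask keeps halving until only a single element remains.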
22169 SmallVector<int, 16> WidenedMask;
22170 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22171 Mask = std::move(WidenedMask);
22172 WidenedMask.clear();
22173 }
22174
22175 return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22176 Subtarget);
22177}
22178
22179/// \brief Get the PSHUF-style mask from PSHUF node.
22180///
22181 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
22182/// PSHUF-style masks that can be reused with such instructions.
22183static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22184 SmallVector<int, 4> Mask;
22185 bool IsUnary;
22186 bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22187 (void)HaveMask;
22188 assert(HaveMask);
22189
22190 switch (N.getOpcode()) {
22191 case X86ISD::PSHUFD:
22192 return Mask;
22193 case X86ISD::PSHUFLW:
22194 Mask.resize(4);
22195 return Mask;
22196 case X86ISD::PSHUFHW:
22197 Mask.erase(Mask.begin(), Mask.begin() + 4);
22198 for (int &M : Mask)
22199 M -= 4;
22200 return Mask;
22201 default:
22202 llvm_unreachable("No valid shuffle instruction found!");
22203 }
22204}
22205
22206/// \brief Search for a combinable shuffle across a chain ending in pshufd.
22207///
22208/// We walk up the chain and look for a combinable shuffle, skipping over
22209/// shuffles that we could hoist this shuffle's transformation past without
22210/// altering anything.
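///
/// For example, when the single-use chain below this PSHUFD leads to another
/// dword shuffle, the two masks are composed into that earlier shuffle and any
/// skipped half-word shuffles are rebuilt on top of the result.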
22211static SDValue
22212combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22213 SelectionDAG &DAG,
22214 TargetLowering::DAGCombinerInfo &DCI) {
22215 assert(N.getOpcode() == X86ISD::PSHUFD &&
22216 "Called with something other than an x86 128-bit half shuffle!");
22217 SDLoc DL(N);
22218
22219 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22220 // of the shuffles in the chain so that we can form a fresh chain to replace
22221 // this one.
22222 SmallVector<SDValue, 8> Chain;
22223 SDValue V = N.getOperand(0);
22224 for (; V.hasOneUse(); V = V.getOperand(0)) {
22225 switch (V.getOpcode()) {
22226 default:
22227 return SDValue(); // Nothing combined!
22228
22229 case ISD::BITCAST:
22230 // Skip bitcasts as we always know the type for the target specific
22231 // instructions.
22232 continue;
22233
22234 case X86ISD::PSHUFD:
22235 // Found another dword shuffle.
22236 break;
22237
22238 case X86ISD::PSHUFLW:
22239 // Check that the low words (being shuffled) are the identity in the
22240 // dword shuffle, and the high words are self-contained.
22241 if (Mask[0] != 0 || Mask[1] != 1 ||
22242 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22243 return SDValue();
22244
22245 Chain.push_back(V);
22246 continue;
22247
22248 case X86ISD::PSHUFHW:
22249 // Check that the high words (being shuffled) are the identity in the
22250 // dword shuffle, and the low words are self-contained.
22251 if (Mask[2] != 2 || Mask[3] != 3 ||
22252 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22253 return SDValue();
22254
22255 Chain.push_back(V);
22256 continue;
22257
22258 case X86ISD::UNPCKL:
22259 case X86ISD::UNPCKH:
22260 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22261 // shuffle into a preceding word shuffle.
22262 if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22263 return SDValue();
22264
22265 // Search for a half-shuffle which we can combine with.
22266 unsigned CombineOp =
22267 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22268 if (V.getOperand(0) != V.getOperand(1) ||
22269 !V->isOnlyUserOf(V.getOperand(0).getNode()))
22270 return SDValue();
22271 Chain.push_back(V);
22272 V = V.getOperand(0);
22273 do {
22274 switch (V.getOpcode()) {
22275 default:
22276 return SDValue(); // Nothing to combine.
22277
22278 case X86ISD::PSHUFLW:
22279 case X86ISD::PSHUFHW:
22280 if (V.getOpcode() == CombineOp)
22281 break;
22282
22283 Chain.push_back(V);
22284
22285 // Fallthrough!
22286 case ISD::BITCAST:
22287 V = V.getOperand(0);
22288 continue;
22289 }
22290 break;
22291 } while (V.hasOneUse());
22292 break;
22293 }
22294 // Break out of the loop if we break out of the switch.
22295 break;
22296 }
22297
22298 if (!V.hasOneUse())
22299 // We fell out of the loop without finding a viable combining instruction.
22300 return SDValue();
22301
22302 // Merge this node's mask and our incoming mask.
22303 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22304 for (int &M : Mask)
22305 M = VMask[M];
22306 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22307 getV4X86ShuffleImm8ForMask(Mask, DAG));
22308
22309 // Rebuild the chain around this new shuffle.
22310 while (!Chain.empty()) {
22311 SDValue W = Chain.pop_back_val();
22312
22313 if (V.getValueType() != W.getOperand(0).getValueType())
22314 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22315
22316 switch (W.getOpcode()) {
22317 default:
22318 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22319
22320 case X86ISD::UNPCKL:
22321 case X86ISD::UNPCKH:
22322 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22323 break;
22324
22325 case X86ISD::PSHUFD:
22326 case X86ISD::PSHUFLW:
22327 case X86ISD::PSHUFHW:
22328 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22329 break;
22330 }
22331 }
22332 if (V.getValueType() != N.getValueType())
22333 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22334
22335 // Return the new chain to replace N.
22336 return V;
22337}
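The mask merge in combineRedundantDWordShuffle above (the M = VMask[M] loop) is plain composition of two 4-lane shuffle masks. A minimal standalone sketch, not from the analyzed file and with hypothetical mask values:

#include <array>
#include <cstdio>

int main() {
  // Inner shuffle V: result lane i is read from input lane VMask[i].
  std::array<int, 4> VMask = {2, 3, 0, 1};
  // Outer shuffle N applied to V's result: lane i is read from lane NMask[i].
  std::array<int, 4> NMask = {1, 0, 3, 2};
  // The single equivalent shuffle reads input lane VMask[NMask[i]],
  // which is exactly what the "M = VMask[M]" loop computes in place.
  for (int i = 0; i < 4; ++i)
    std::printf("composed[%d] = %d\n", i, VMask[NMask[i]]);
  return 0;
}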
22338
22339/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22340///
22341/// We walk up the chain, skipping shuffles of the other half and looking
22342/// through shuffles which switch halves trying to find a shuffle of the same
22343/// pair of dwords.
22344static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22345 SelectionDAG &DAG,
22346 TargetLowering::DAGCombinerInfo &DCI) {
22347 assert(
22348 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22349 "Called with something other than an x86 128-bit half shuffle!");
22350 SDLoc DL(N);
22351 unsigned CombineOpcode = N.getOpcode();
22352
22353 // Walk up a single-use chain looking for a combinable shuffle.
22354 SDValue V = N.getOperand(0);
22355 for (; V.hasOneUse(); V = V.getOperand(0)) {
22356 switch (V.getOpcode()) {
22357 default:
22358 return false; // Nothing combined!
22359
22360 case ISD::BITCAST:
22361 // Skip bitcasts as we always know the type for the target specific
22362 // instructions.
22363 continue;
22364
22365 case X86ISD::PSHUFLW:
22366 case X86ISD::PSHUFHW:
22367 if (V.getOpcode() == CombineOpcode)
22368 break;
22369
22370 // Other-half shuffles are no-ops.
22371 continue;
22372 }
22373 // Break out of the loop if we break out of the switch.
22374 break;
22375 }
22376
22377 if (!V.hasOneUse())
22378 // We fell out of the loop without finding a viable combining instruction.
22379 return false;
22380
22381 // Combine away the bottom node as its shuffle will be accumulated into
22382 // a preceding shuffle.
22383 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22384
22385 // Record the old value.
22386 SDValue Old = V;
22387
22388 // Merge this node's mask and our incoming mask (adjusted to account for all
22389 // the pshufd instructions encountered).
22390 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22391 for (int &M : Mask)
22392 M = VMask[M];
22393 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22394 getV4X86ShuffleImm8ForMask(Mask, DAG));
22395
22396 // Check that the shuffles didn't cancel each other out. If not, we need to
22397 // combine to the new one.
22398 if (Old != V)
22399 // Replace the combinable shuffle with the combined one, updating all users
22400 // so that we re-evaluate the chain here.
22401 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22402
22403 return true;
22404}
22405
22406/// \brief Try to combine x86 target specific shuffles.
22407static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22408 TargetLowering::DAGCombinerInfo &DCI,
22409 const X86Subtarget *Subtarget) {
22410 SDLoc DL(N);
22411 MVT VT = N.getSimpleValueType();
22412 SmallVector<int, 4> Mask;
22413
22414 switch (N.getOpcode()) {
22415 case X86ISD::PSHUFD:
22416 case X86ISD::PSHUFLW:
22417 case X86ISD::PSHUFHW:
22418 Mask = getPSHUFShuffleMask(N);
22419 assert(Mask.size() == 4);
22420 break;
22421 default:
22422 return SDValue();
22423 }
22424
22425 // Nuke no-op shuffles that show up after combining.
22426 if (isNoopShuffleMask(Mask))
22427 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22428
22429 // Look for simplifications involving one or two shuffle instructions.
22430 SDValue V = N.getOperand(0);
22431 switch (N.getOpcode()) {
22432 default:
22433 break;
22434 case X86ISD::PSHUFLW:
22435 case X86ISD::PSHUFHW:
22436 assert(VT == MVT::v8i16);
22437 (void)VT;
22438
22439 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22440 return SDValue(); // We combined away this shuffle, so we're done.
22441
22442 // See if this reduces to a PSHUFD which is no more expensive and can
22443 // combine with more operations. Note that it has to at least flip the
22444 // dwords as otherwise it would have been removed as a no-op.
22445 if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22446 int DMask[] = {0, 1, 2, 3};
22447 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22448 DMask[DOffset + 0] = DOffset + 1;
22449 DMask[DOffset + 1] = DOffset + 0;
22450 V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22451 DCI.AddToWorklist(V.getNode());
22452 V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22453 getV4X86ShuffleImm8ForMask(DMask, DAG));
22454 DCI.AddToWorklist(V.getNode());
22455 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22456 }
22457
22458 // Look for shuffle patterns which can be implemented as a single unpack.
22459 // FIXME: This doesn't handle the location of the PSHUFD generically, and
22460 // only works when we have a PSHUFD followed by two half-shuffles.
22461 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22462 (V.getOpcode() == X86ISD::PSHUFLW ||
22463 V.getOpcode() == X86ISD::PSHUFHW) &&
22464 V.getOpcode() != N.getOpcode() &&
22465 V.hasOneUse()) {
22466 SDValue D = V.getOperand(0);
22467 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22468 D = D.getOperand(0);
22469 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22470 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22471 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22472 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22473 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22474 int WordMask[8];
22475 for (int i = 0; i < 4; ++i) {
22476 WordMask[i + NOffset] = Mask[i] + NOffset;
22477 WordMask[i + VOffset] = VMask[i] + VOffset;
22478 }
22479 // Map the word mask through the DWord mask.
22480 int MappedMask[8];
22481 for (int i = 0; i < 8; ++i)
22482 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22483 const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22484 const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22485 if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22486 std::begin(UnpackLoMask)) ||
22487 std::equal(std::begin(MappedMask), std::end(MappedMask),
22488 std::begin(UnpackHiMask))) {
22489 // We can replace all three shuffles with an unpack.
22490 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22491 DCI.AddToWorklist(V.getNode());
22492 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22493 : X86ISD::UNPCKH,
22494 DL, MVT::v8i16, V, V);
22495 }
22496 }
22497 }
22498
22499 break;
22500
22501 case X86ISD::PSHUFD:
22502 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22503 return NewN;
22504
22505 break;
22506 }
22507
22508 return SDValue();
22509}
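The single-unpack recognition in PerformTargetShuffleCombine relies on mapping an 8-lane word mask through a 4-lane dword mask: a dword shuffle moves words in pairs, so output word i of a PSHUFD reads input word 2*DMask[i/2] + i%2. A minimal standalone sketch of that composition, with hypothetical mask values (not taken from the file):

#include <cstdio>

int main() {
  const int DMask[4] = {1, 0, 2, 3};                // hypothetical PSHUFD mask
  const int WordMask[8] = {0, 0, 1, 1, 4, 5, 6, 7}; // hypothetical word mask over its result
  // Composition used above: MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2.
  for (int i = 0; i < 8; ++i)
    std::printf("MappedMask[%d] = %d\n", i,
                2 * DMask[WordMask[i] / 2] + WordMask[i] % 2);
  // If the mapped mask comes out as {0,0,1,1,2,2,3,3} or {4,4,5,5,6,6,7,7},
  // the whole chain behaves like a single UNPCKL/UNPCKH of the original input.
  return 0;
}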
22510
22511/// \brief Try to combine a shuffle into a target-specific add-sub node.
22512///
22513/// We combine this directly on the abstract vector shuffle nodes so it is
22514/// easier to generically match. We also insert dummy vector shuffle nodes for
22515/// the operands which explicitly discard the lanes which are unused by this
22516/// operation to try to flow through the rest of the combiner the fact that
22517/// they're unused.
22518static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22519 SDLoc DL(N);
22520 EVT VT = N->getValueType(0);
22521
22522 // We only handle target-independent shuffles.
22523 // FIXME: It would be easy and harmless to use the target shuffle mask
22524 // extraction tool to support more.
22525 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22526 return SDValue();
22527
22528 auto *SVN = cast<ShuffleVectorSDNode>(N);
22529 ArrayRef<int> Mask = SVN->getMask();
22530 SDValue V1 = N->getOperand(0);
22531 SDValue V2 = N->getOperand(1);
22532
22533 // We require the first shuffle operand to be the SUB node, and the second to
22534 // be the ADD node.
22535 // FIXME: We should support the commuted patterns.
22536 if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22537 return SDValue();
22538
22539 // If there are other uses of these operations we can't fold them.
22540 if (!V1->hasOneUse() || !V2->hasOneUse())
22541 return SDValue();
22542
22543 // Ensure that both operations have the same operands. Note that we can
22544 // commute the FADD operands.
22545 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22546 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22547 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22548 return SDValue();
22549
22550 // We're looking for blends between FADD and FSUB nodes. We insist on these
22551 // nodes being lined up in a specific expected pattern.
22552 if (!(isShuffleEquivalent(Mask, 0, 3) ||
22553 isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22554 isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22555 return SDValue();
22556
22557 // Only specific types are legal at this point, assert so we notice if and
22558 // when these change.
22559 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22560 VT == MVT::v4f64) &&
22561 "Unknown vector type encountered!");
22562
22563 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22564}
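The shuffle masks checked in combineShuffleToAddSub above encode exactly the ADDSUB lane pattern: for v4f32, mask <0,5,2,7> takes the FSUB result in even lanes and the FADD result in odd lanes. A scalar standalone sketch (hypothetical values, not from the file):

#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> Sub, Add, Blend;
  for (int i = 0; i < 4; ++i) {
    Sub[i] = A[i] - B[i]; // first shuffle operand (lanes 0..3): the FSUB node
    Add[i] = A[i] + B[i]; // second shuffle operand (lanes 4..7): the FADD node
  }
  const int Mask[4] = {0, 5, 2, 7}; // the v4f32 pattern tested above
  for (int i = 0; i < 4; ++i)
    Blend[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
  // Blend = {A0-B0, A1+B1, A2-B2, A3+B3}: subtract in even lanes, add in odd
  // lanes, which is the ADDSUB (addsubps) behaviour.
  for (float F : Blend)
    std::printf("%g ", F);
  std::printf("\n");
  return 0;
}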
22565
22566/// PerformShuffleCombine - Performs several different shuffle combines.
22567static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22568 TargetLowering::DAGCombinerInfo &DCI,
22569 const X86Subtarget *Subtarget) {
22570 SDLoc dl(N);
22571 SDValue N0 = N->getOperand(0);
22572 SDValue N1 = N->getOperand(1);
22573 EVT VT = N->getValueType(0);
22574
22575 // Don't create instructions with illegal types after legalize types has run.
22576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22577 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
1. Taking false branch
22578 return SDValue();
22579
22580 // If we have legalized the vector types, look for blends of FADD and FSUB
22581 // nodes that we can fuse into an ADDSUB node.
22582 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22583 if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22584 return AddSub;
22585
22586 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22587 if (Subtarget->hasFp256() && VT.is256BitVector() &&
22588 N->getOpcode() == ISD::VECTOR_SHUFFLE)
22589 return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22590
22591 // During Type Legalization, when promoting illegal vector types,
22592 // the backend might introduce new shuffle dag nodes and bitcasts.
22593 //
22594 // This code performs the following transformation:
22595 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22596 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22597 //
22598 // We do this only if both the bitcast and the BINOP dag nodes have
22599 // one use. Also, perform this transformation only if the new binary
22600 // operation is legal. This is to avoid introducing dag nodes that
22601 // potentially need to be further expanded (or custom lowered) into a
22602 // less optimal sequence of dag nodes.
22603 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22604 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22605 N0.getOpcode() == ISD::BITCAST) {
22606 SDValue BC0 = N0.getOperand(0);
22607 EVT SVT = BC0.getValueType();
22608 unsigned Opcode = BC0.getOpcode();
22609 unsigned NumElts = VT.getVectorNumElements();
22610
22611 if (BC0.hasOneUse() && SVT.isVector() &&
22612 SVT.getVectorNumElements() * 2 == NumElts &&
22613 TLI.isOperationLegal(Opcode, VT)) {
22614 bool CanFold = false;
22615 switch (Opcode) {
22616 default : break;
22617 case ISD::ADD :
22618 case ISD::FADD :
22619 case ISD::SUB :
22620 case ISD::FSUB :
22621 case ISD::MUL :
22622 case ISD::FMUL :
22623 CanFold = true;
22624 }
22625
22626 unsigned SVTNumElts = SVT.getVectorNumElements();
22627 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22628 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22629 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22630 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22631 CanFold = SVOp->getMaskElt(i) < 0;
22632
22633 if (CanFold) {
22634 SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22635 SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22636 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22637 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22638 }
22639 }
22640 }
22641
22642 // Only handle 128-bit wide vectors from here on.
22643 if (!VT.is128BitVector())
2. Taking false branch
22644 return SDValue();
22645
22646 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22647 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22648 // consecutive, non-overlapping, and in the right order.
22649 SmallVector<SDValue, 16> Elts;
22650 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
3. Assuming 'i' is equal to 'e'
4. Loop condition is false. Execution continues on line 22653
22651 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22652
22653 SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
5. Calling 'EltsFromConsecutiveLoads'
22654 if (LD.getNode())
22655 return LD;
22656
22657 if (isTargetShuffle(N->getOpcode())) {
22658 SDValue Shuffle =
22659 PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22660 if (Shuffle.getNode())
22661 return Shuffle;
22662
22663 // Try recursively combining arbitrary sequences of x86 shuffle
22664 // instructions into higher-order shuffles. We do this after combining
22665 // specific PSHUF instruction sequences into their minimal form so that we
22666 // can evaluate how many specialized shuffle instructions are involved in
22667 // a particular chain.
22668 SmallVector<int, 1> NonceMask; // Just a placeholder.
22669 NonceMask.push_back(0);
22670 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22671 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22672 DCI, Subtarget))
22673 return SDValue(); // This routine will use CombineTo to replace N.
22674 }
22675
22676 return SDValue();
22677}
22678
22679/// PerformTruncateCombine - Converts a truncate operation into
22680/// a sequence of vector shuffle operations.
22681/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
22682static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22683 TargetLowering::DAGCombinerInfo &DCI,
22684 const X86Subtarget *Subtarget) {
22685 return SDValue();
22686}
22687
22688/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22689/// specific shuffle of a load can be folded into a single element load.
22690/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22691/// shuffles have been custom lowered so we need to handle those here.
22692static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22693 TargetLowering::DAGCombinerInfo &DCI) {
22694 if (DCI.isBeforeLegalizeOps())
22695 return SDValue();
22696
22697 SDValue InVec = N->getOperand(0);
22698 SDValue EltNo = N->getOperand(1);
22699
22700 if (!isa<ConstantSDNode>(EltNo))
22701 return SDValue();
22702
22703 EVT OriginalVT = InVec.getValueType();
22704
22705 if (InVec.getOpcode() == ISD::BITCAST) {
22706 // Don't duplicate a load with other uses.
22707 if (!InVec.hasOneUse())
22708 return SDValue();
22709 EVT BCVT = InVec.getOperand(0).getValueType();
22710 if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22711 return SDValue();
22712 InVec = InVec.getOperand(0);
22713 }
22714
22715 EVT CurrentVT = InVec.getValueType();
22716
22717 if (!isTargetShuffle(InVec.getOpcode()))
22718 return SDValue();
22719
22720 // Don't duplicate a load with other uses.
22721 if (!InVec.hasOneUse())
22722 return SDValue();
22723
22724 SmallVector<int, 16> ShuffleMask;
22725 bool UnaryShuffle;
22726 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22727 ShuffleMask, UnaryShuffle))
22728 return SDValue();
22729
22730 // Select the input vector, guarding against an out-of-range extract index.
22731 unsigned NumElems = CurrentVT.getVectorNumElements();
22732 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22733 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22734 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22735 : InVec.getOperand(1);
22736
22737 // If inputs to shuffle are the same for both ops, then allow 2 uses
22738 unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22739 InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22740
22741 if (LdNode.getOpcode() == ISD::BITCAST) {
22742 // Don't duplicate a load with other uses.
22743 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22744 return SDValue();
22745
22746 AllowedUses = 1; // only allow 1 load use if we have a bitcast
22747 LdNode = LdNode.getOperand(0);
22748 }
22749
22750 if (!ISD::isNormalLoad(LdNode.getNode()))
22751 return SDValue();
22752
22753 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22754
22755 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22756 return SDValue();
22757
22758 EVT EltVT = N->getValueType(0);
22759 // If there's a bitcast before the shuffle, check if the load type and
22760 // alignment is valid.
22761 unsigned Align = LN0->getAlignment();
22762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22763 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22764 EltVT.getTypeForEVT(*DAG.getContext()));
22765
22766 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22767 return SDValue();
22768
22769 // All checks match so transform back to vector_shuffle so that DAG combiner
22770 // can finish the job
22771 SDLoc dl(N);
22772
22773 // Create a shuffle node, taking into account the case that it's a unary shuffle.
22774 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22775 : InVec.getOperand(1);
22776 Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22777 InVec.getOperand(0), Shuffle,
22778 &ShuffleMask[0]);
22779 Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22780 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22781 EltNo);
22782}
22783
22784/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22785/// generation and convert it from being a bunch of shuffles and extracts
22786/// into a somewhat faster sequence. For i686, the best sequence is apparently
22787/// storing the value and loading scalars back, while for x64 we should
22788/// use 64-bit extracts and shifts.
22789static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22790 TargetLowering::DAGCombinerInfo &DCI) {
22791 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22792 if (NewOp.getNode())
22793 return NewOp;
22794
22795 SDValue InputVector = N->getOperand(0);
22796
22797 // Detect whether we are trying to convert from mmx to i32 and the bitcast
22798 // from mmx to v2i32 has a single usage.
22799 if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22800 InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22801 InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22802 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22803 N->getValueType(0),
22804 InputVector.getNode()->getOperand(0));
22805
22806 // Only operate on vectors of 4 elements, where the alternative shuffling
22807 // gets to be more expensive.
22808 if (InputVector.getValueType() != MVT::v4i32)
22809 return SDValue();
22810
22811 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
22812 // single use which is a sign-extend or zero-extend, and all elements are
22813 // used.
22814 SmallVector<SDNode *, 4> Uses;
22815 unsigned ExtractedElements = 0;
22816 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22817 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22818 if (UI.getUse().getResNo() != InputVector.getResNo())
22819 return SDValue();
22820
22821 SDNode *Extract = *UI;
22822 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22823 return SDValue();
22824
22825 if (Extract->getValueType(0) != MVT::i32)
22826 return SDValue();
22827 if (!Extract->hasOneUse())
22828 return SDValue();
22829 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22830 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22831 return SDValue();
22832 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22833 return SDValue();
22834
22835 // Record which element was extracted.
22836 ExtractedElements |=
22837 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22838
22839 Uses.push_back(Extract);
22840 }
22841
22842 // If not all the elements were used, this may not be worthwhile.
22843 if (ExtractedElements != 15)
22844 return SDValue();
22845
22846 // Ok, we've now decided to do the transformation.
22847 // If 64-bit shifts are legal, use the extract-shift sequence,
22848 // otherwise bounce the vector off the cache.
22849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22850 SDValue Vals[4];
22851 SDLoc dl(InputVector);
22852
22853 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22854 SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22855 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22856 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22857 DAG.getConstant(0, VecIdxTy));
22858 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22859 DAG.getConstant(1, VecIdxTy));
22860
22861 SDValue ShAmt = DAG.getConstant(32,
22862 DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22863 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22864 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22865 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22866 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22867 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22868 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
22869 } else {
22870 // Store the value to a temporary stack slot.
22871 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
22872 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
22873 MachinePointerInfo(), false, false, 0);
22874
22875 EVT ElementType = InputVector.getValueType().getVectorElementType();
22876 unsigned EltSize = ElementType.getSizeInBits() / 8;
22877
22878 // Replace each use (extract) with a load of the appropriate element.
22879 for (unsigned i = 0; i < 4; ++i) {
22880 uint64_t Offset = EltSize * i;
22881 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
22882
22883 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
22884 StackPtr, OffsetVal);
22885
22886 // Load the scalar.
22887 Vals[i] = DAG.getLoad(ElementType, dl, Ch,
22888 ScalarAddr, MachinePointerInfo(),
22889 false, false, false, 0);
22890
22891 }
22892 }
22893
22894 // Replace the extracts
22895 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
22896 UE = Uses.end(); UI != UE; ++UI) {
22897 SDNode *Extract = *UI;
22898
22899 SDValue Idx = Extract->getOperand(1);
22900 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
22901 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
22902 }
22903
22904 // The replacement was made in place; don't return anything.
22905 return SDValue();
22906}
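The "extract-shift" path in PerformEXTRACT_VECTOR_ELTCombine recovers the four i32 lanes of a v4i32 from the two i64 halves of its bitcast using a 32-bit shift and a truncate. A standalone scalar sketch of the same bit arithmetic; it assumes a little-endian host, as on x86, and is not code from the analyzed file:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint32_t Lanes[4] = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
  uint64_t Halves[2];
  std::memcpy(Halves, Lanes, sizeof Lanes);      // the BITCAST to v2i64
  for (int i = 0; i < 4; ++i) {
    uint64_t Half = Halves[i / 2];               // EXTRACT_VECTOR_ELT of an i64 half
    // Same low 32 bits as the SRA-by-32 + TRUNCATE combination used above.
    uint32_t Val = (uint32_t)(i % 2 ? Half >> 32 : Half);
    std::printf("lane %d = 0x%08x\n", i, Val);
  }
  return 0;
}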
22907
22908/// \brief Matches a VSELECT onto min/max, or returns 0 if the node doesn't match.
22909static std::pair<unsigned, bool>
22910matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
22911 SelectionDAG &DAG, const X86Subtarget *Subtarget) {
22912 if (!VT.isVector())
22913 return std::make_pair(0, false);
22914
22915 bool NeedSplit = false;
22916 switch (VT.getSimpleVT().SimpleTy) {
22917 default: return std::make_pair(0, false);
22918 case MVT::v4i64:
22919 case MVT::v2i64:
22920 if (!Subtarget->hasVLX())
22921 return std::make_pair(0, false);
22922 break;
22923 case MVT::v64i8:
22924 case MVT::v32i16:
22925 if (!Subtarget->hasBWI())
22926 return std::make_pair(0, false);
22927 break;
22928 case MVT::v16i32:
22929 case MVT::v8i64:
22930 if (!Subtarget->hasAVX512())
22931 return std::make_pair(0, false);
22932 break;
22933 case MVT::v32i8:
22934 case MVT::v16i16:
22935 case MVT::v8i32:
22936 if (!Subtarget->hasAVX2())
22937 NeedSplit = true;
22938 if (!Subtarget->hasAVX())
22939 return std::make_pair(0, false);
22940 break;
22941 case MVT::v16i8:
22942 case MVT::v8i16:
22943 case MVT::v4i32:
22944 if (!Subtarget->hasSSE2())
22945 return std::make_pair(0, false);
22946 }
22947
22948 // SSE2 has only a small subset of the operations.
22949 bool hasUnsigned = Subtarget->hasSSE41() ||
22950 (Subtarget->hasSSE2() && VT == MVT::v16i8);
22951 bool hasSigned = Subtarget->hasSSE41() ||
22952 (Subtarget->hasSSE2() && VT == MVT::v8i16);
22953
22954 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
22955
22956 unsigned Opc = 0;
22957 // Check for x CC y ? x : y.
22958 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
22959 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
22960 switch (CC) {
22961 default: break;
22962 case ISD::SETULT:
22963 case ISD::SETULE:
22964 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22965 case ISD::SETUGT:
22966 case ISD::SETUGE:
22967 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22968 case ISD::SETLT:
22969 case ISD::SETLE:
22970 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22971 case ISD::SETGT:
22972 case ISD::SETGE:
22973 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22974 }
22975 // Check for x CC y ? y : x -- a min/max with reversed arms.
22976 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
22977 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
22978 switch (CC) {
22979 default: break;
22980 case ISD::SETULT:
22981 case ISD::SETULE:
22982 Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
22983 case ISD::SETUGT:
22984 case ISD::SETUGE:
22985 Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
22986 case ISD::SETLT:
22987 case ISD::SETLE:
22988 Opc = hasSigned ? X86ISD::SMAX : 0; break;
22989 case ISD::SETGT:
22990 case ISD::SETGE:
22991 Opc = hasSigned ? X86ISD::SMIN : 0; break;
22992 }
22993 }
22994
22995 return std::make_pair(Opc, NeedSplit);
22996}
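Per lane, the select patterns matchIntegerMINMAX recognizes are the usual conditional-move spellings of min and max; reversing the select arms flips min into max and vice versa. A scalar sketch with hypothetical values:

#include <algorithm>
#include <cassert>

int main() {
  int x = -3, y = 7;
  assert((x < y ? x : y) == std::min(x, y)); // SETLT, arms (x, y)   -> SMIN
  assert((x > y ? x : y) == std::max(x, y)); // SETGT, arms (x, y)   -> SMAX
  assert((x < y ? y : x) == std::max(x, y)); // SETLT, reversed arms -> SMAX
  return 0;
}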
22997
22998static SDValue
22999transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23000 const X86Subtarget *Subtarget) {
23001 SDLoc dl(N);
23002 SDValue Cond = N->getOperand(0);
23003 SDValue LHS = N->getOperand(1);
23004 SDValue RHS = N->getOperand(2);
23005
23006 if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23007 SDValue CondSrc = Cond->getOperand(0);
23008 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23009 Cond = CondSrc->getOperand(0);
23010 }
23011
23012 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23013 return SDValue();
23014
23015 // A vselect where all conditions and data are constants can be optimized into
23016 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23017 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23018 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23019 return SDValue();
23020
23021 unsigned MaskValue = 0;
23022 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23023 return SDValue();
23024
23025 MVT VT = N->getSimpleValueType(0);
23026 unsigned NumElems = VT.getVectorNumElements();
23027 SmallVector<int, 8> ShuffleMask(NumElems, -1);
23028 for (unsigned i = 0; i < NumElems; ++i) {
23029 // Be sure we emit undef where we can.
23030 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23031 ShuffleMask[i] = -1;
23032 else
23033 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23034 }
23035
23036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23037 if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23038 return SDValue();
23039 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23040}
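transformVSELECTtoBlendVECTOR_SHUFFLE turns a bit-per-lane blend mask into a two-operand shuffle mask, where indices of NumElems or more select from the second operand. A standalone sketch with a hypothetical mask value:

#include <cstdio>

int main() {
  const unsigned NumElems = 4;
  const unsigned MaskValue = 0x5; // bits 0 and 2 set
  for (unsigned i = 0; i < NumElems; ++i)
    std::printf("ShuffleMask[%u] = %u\n", i,
                i + NumElems * ((MaskValue >> i) & 1));
  // Prints 4, 1, 6, 3: lanes 0 and 2 come from the second operand
  // (indices 4..7), lanes 1 and 3 from the first (indices 0..3).
  return 0;
}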
23041
23042/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23043/// nodes.
23044static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23045 TargetLowering::DAGCombinerInfo &DCI,
23046 const X86Subtarget *Subtarget) {
23047 SDLoc DL(N);
23048 SDValue Cond = N->getOperand(0);
23049 // Get the LHS/RHS of the select.
23050 SDValue LHS = N->getOperand(1);
23051 SDValue RHS = N->getOperand(2);
23052 EVT VT = LHS.getValueType();
23053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23054
23055 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23056 // instructions match the semantics of the common C idiom x<y?x:y but not
23057 // x<=y?x:y, because of how they handle negative zero (which can be
23058 // ignored in unsafe-math mode).
23059 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23060 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23061 VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23062 (Subtarget->hasSSE2() ||
23063 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23064 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23065
23066 unsigned Opcode = 0;
23067 // Check for x CC y ? x : y.
23068 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23069 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23070 switch (CC) {
23071 default: break;
23072 case ISD::SETULT:
23073 // Converting this to a min would handle NaNs incorrectly, and swapping
23074 // the operands would cause it to handle comparisons between positive
23075 // and negative zero incorrectly.
23076 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23077 if (!DAG.getTarget().Options.UnsafeFPMath &&
23078 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23079 break;
23080 std::swap(LHS, RHS);
23081 }
23082 Opcode = X86ISD::FMIN;
23083 break;
23084 case ISD::SETOLE:
23085 // Converting this to a min would handle comparisons between positive
23086 // and negative zero incorrectly.
23087 if (!DAG.getTarget().Options.UnsafeFPMath &&
23088 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23089 break;
23090 Opcode = X86ISD::FMIN;
23091 break;
23092 case ISD::SETULE:
23093 // Converting this to a min would handle both negative zeros and NaNs
23094 // incorrectly, but we can swap the operands to fix both.
23095 std::swap(LHS, RHS);
23096 case ISD::SETOLT:
23097 case ISD::SETLT:
23098 case ISD::SETLE:
23099 Opcode = X86ISD::FMIN;
23100 break;
23101
23102 case ISD::SETOGE:
23103 // Converting this to a max would handle comparisons between positive
23104 // and negative zero incorrectly.
23105 if (!DAG.getTarget().Options.UnsafeFPMath &&
23106 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23107 break;
23108 Opcode = X86ISD::FMAX;
23109 break;
23110 case ISD::SETUGT:
23111 // Converting this to a max would handle NaNs incorrectly, and swapping
23112 // the operands would cause it to handle comparisons between positive
23113 // and negative zero incorrectly.
23114 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23115 if (!DAG.getTarget().Options.UnsafeFPMath &&
23116 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23117 break;
23118 std::swap(LHS, RHS);
23119 }
23120 Opcode = X86ISD::FMAX;
23121 break;
23122 case ISD::SETUGE:
23123 // Converting this to a max would handle both negative zeros and NaNs
23124 // incorrectly, but we can swap the operands to fix both.
23125 std::swap(LHS, RHS);
23126 case ISD::SETOGT:
23127 case ISD::SETGT:
23128 case ISD::SETGE:
23129 Opcode = X86ISD::FMAX;
23130 break;
23131 }
23132 // Check for x CC y ? y : x -- a min/max with reversed arms.
23133 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23134 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23135 switch (CC) {
23136 default: break;
23137 case ISD::SETOGE:
23138 // Converting this to a min would handle comparisons between positive
23139 // and negative zero incorrectly, and swapping the operands would
23140 // cause it to handle NaNs incorrectly.
23141 if (!DAG.getTarget().Options.UnsafeFPMath &&
23142 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23143 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23144 break;
23145 std::swap(LHS, RHS);
23146 }
23147 Opcode = X86ISD::FMIN;
23148 break;
23149 case ISD::SETUGT:
23150 // Converting this to a min would handle NaNs incorrectly.
23151 if (!DAG.getTarget().Options.UnsafeFPMath &&
23152 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23153 break;
23154 Opcode = X86ISD::FMIN;
23155 break;
23156 case ISD::SETUGE:
23157 // Converting this to a min would handle both negative zeros and NaNs
23158 // incorrectly, but we can swap the operands to fix both.
23159 std::swap(LHS, RHS);
23160 case ISD::SETOGT:
23161 case ISD::SETGT:
23162 case ISD::SETGE:
23163 Opcode = X86ISD::FMIN;
23164 break;
23165
23166 case ISD::SETULT:
23167 // Converting this to a max would handle NaNs incorrectly.
23168 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23169 break;
23170 Opcode = X86ISD::FMAX;
23171 break;
23172 case ISD::SETOLE:
23173 // Converting this to a max would handle comparisons between positive
23174 // and negative zero incorrectly, and swapping the operands would
23175 // cause it to handle NaNs incorrectly.
23176 if (!DAG.getTarget().Options.UnsafeFPMath &&
23177 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23178 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23179 break;
23180 std::swap(LHS, RHS);
23181 }
23182 Opcode = X86ISD::FMAX;
23183 break;
23184 case ISD::SETULE:
23185 // Converting this to a max would handle both negative zeros and NaNs
23186 // incorrectly, but we can swap the operands to fix both.
23187 std::swap(LHS, RHS);
23188 case ISD::SETOLT:
23189 case ISD::SETLT:
23190 case ISD::SETLE:
23191 Opcode = X86ISD::FMAX;
23192 break;
23193 }
23194 }
23195
23196 if (Opcode)
23197 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23198 }
23199
23200 EVT CondVT = Cond.getValueType();
23201 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23202 CondVT.getVectorElementType() == MVT::i1) {
23203 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23204 // lowering on KNL. In this case we convert it to
23205 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23206 // The same situation for all 128 and 256-bit vectors of i8 and i16.
23207 // Since SKX these selects have a proper lowering.
23208 EVT OpVT = LHS.getValueType();
23209 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23210 (OpVT.getVectorElementType() == MVT::i8 ||
23211 OpVT.getVectorElementType() == MVT::i16) &&
23212 !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23213 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23214 DCI.AddToWorklist(Cond.getNode());
23215 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23216 }
23217 }
23218 // If this is a select between two integer constants, try to do some
23219 // optimizations.
23220 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23221 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23222 // Don't do this for crazy integer types.
23223 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23224 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23225 // so that TrueC (the true value) is larger than FalseC.
23226 bool NeedsCondInvert = false;
23227
23228 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23229 // Efficiently invertible.
23230 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
23231 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
23232 isa<ConstantSDNode>(Cond.getOperand(1))))) {
23233 NeedsCondInvert = true;
23234 std::swap(TrueC, FalseC);
23235 }
23236
23237 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
23238 if (FalseC->getAPIntValue() == 0 &&
23239 TrueC->getAPIntValue().isPowerOf2()) {
23240 if (NeedsCondInvert) // Invert the condition if needed.
23241 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23242 DAG.getConstant(1, Cond.getValueType()));
23243
23244 // Zero extend the condition if needed.
23245 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23246
23247 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23248 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23249 DAG.getConstant(ShAmt, MVT::i8));
23250 }
23251
23252 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
23253 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23254 if (NeedsCondInvert) // Invert the condition if needed.
23255 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23256 DAG.getConstant(1, Cond.getValueType()));
23257
23258 // Zero extend the condition if needed.
23259 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23260 FalseC->getValueType(0), Cond);
23261 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23262 SDValue(FalseC, 0));
23263 }
23264
23265 // Optimize cases that will turn into an LEA instruction. This requires
23266 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23267 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23268 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23269 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23270
23271 bool isFastMultiplier = false;
23272 if (Diff < 10) {
23273 switch ((unsigned char)Diff) {
23274 default: break;
23275 case 1: // result = add base, cond
23276 case 2: // result = lea base( , cond*2)
23277 case 3: // result = lea base(cond, cond*2)
23278 case 4: // result = lea base( , cond*4)
23279 case 5: // result = lea base(cond, cond*4)
23280 case 8: // result = lea base( , cond*8)
23281 case 9: // result = lea base(cond, cond*8)
23282 isFastMultiplier = true;
23283 break;
23284 }
23285 }
23286
23287 if (isFastMultiplier) {
23288 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23289 if (NeedsCondInvert) // Invert the condition if needed.
23290 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23291 DAG.getConstant(1, Cond.getValueType()));
23292
23293 // Zero extend the condition if needed.
23294 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23295 Cond);
23296 // Scale the condition by the difference.
23297 if (Diff != 1)
23298 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23299 DAG.getConstant(Diff, Cond.getValueType()));
23300
23301 // Add the base if non-zero.
23302 if (FalseC->getAPIntValue() != 0)
23303 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23304 SDValue(FalseC, 0));
23305 return Cond;
23306 }
23307 }
23308 }
23309 }
23310
23311 // Canonicalize max and min:
23312 // (x > y) ? x : y -> (x >= y) ? x : y
23313 // (x < y) ? x : y -> (x <= y) ? x : y
23314 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23315 // the need for an extra compare
23316 // against zero. e.g.
23317 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23318 // subl %esi, %edi
23319 // testl %edi, %edi
23320 // movl $0, %eax
23321 // cmovgl %edi, %eax
23322 // =>
23323 // xorl %eax, %eax
23324 // subl %esi, $edi
23325 // cmovsl %eax, %edi
23326 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23327 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23328 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23329 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23330 switch (CC) {
23331 default: break;
23332 case ISD::SETLT:
23333 case ISD::SETGT: {
23334 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23335 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23336 Cond.getOperand(0), Cond.getOperand(1), NewCC);
23337 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23338 }
23339 }
23340 }
23341
23342 // Early exit check
23343 if (!TLI.isTypeLegal(VT))
23344 return SDValue();
23345
23346 // Match VSELECTs into subs with unsigned saturation.
23347 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23348 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23349 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23350 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23351 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23352
23353 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23354 // left side invert the predicate to simplify logic below.
23355 SDValue Other;
23356 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23357 Other = RHS;
23358 CC = ISD::getSetCCInverse(CC, true);
23359 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23360 Other = LHS;
23361 }
23362
23363 if (Other.getNode() && Other->getNumOperands() == 2 &&
23364 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23365 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23366 SDValue CondRHS = Cond->getOperand(1);
23367
23368 // Look for a general sub with unsigned saturation first.
23369 // x >= y ? x-y : 0 --> subus x, y
23370 // x > y ? x-y : 0 --> subus x, y
23371 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23372 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23373 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23374
23375 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23376 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23377 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23378 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23379 // If the RHS is a constant we have to reverse the const
23380 // canonicalization.
23381 // x > C-1 ? x+-C : 0 --> subus x, C
23382 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23383 CondRHSConst->getAPIntValue() ==
23384 (-OpRHSConst->getAPIntValue() - 1))
23385 return DAG.getNode(
23386 X86ISD::SUBUS, DL, VT, OpLHS,
23387 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23388
23389 // Another special case: If C was a sign bit, the sub has been
23390 // canonicalized into a xor.
23391 // FIXME: Would it be better to use computeKnownBits to determine
23392 // whether it's safe to decanonicalize the xor?
23393 // x s< 0 ? x^C : 0 --> subus x, C
23394 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23395 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23396 OpRHSConst->getAPIntValue().isSignBit())
23397 // Note that we have to rebuild the RHS constant here to ensure we
23398 // don't rely on particular values of undef lanes.
23399 return DAG.getNode(
23400 X86ISD::SUBUS, DL, VT, OpLHS,
23401 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23402 }
23403 }
23404 }
23405
23406 // Try to match a min/max vector operation.
23407 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23408 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23409 unsigned Opc = ret.first;
23410 bool NeedSplit = ret.second;
23411
23412 if (Opc && NeedSplit) {
23413 unsigned NumElems = VT.getVectorNumElements();
23414 // Extract the LHS vectors
23415 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23416 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23417
23418 // Extract the RHS vectors
23419 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23420 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23421
23422 // Create min/max for each subvector
23423 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23424 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23425
23426 // Merge the result
23427 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23428 } else if (Opc)
23429 return DAG.getNode(Opc, DL, VT, LHS, RHS);
23430 }
23431
23432 // Simplify vector selection if condition value type matches vselect
23433 // operand type
23434 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23435 assert(Cond.getValueType().isVector() &&
23436 "vector select expects a vector selector!");
23437
23438 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23439 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23440
23441 // Try to invert the condition if the true value is not all 1s and the
23442 // false value is not all 0s.
23443 if (!TValIsAllOnes && !FValIsAllZeros &&
23444 // Check if the selector will be produced by CMPP*/PCMP*
23445 Cond.getOpcode() == ISD::SETCC &&
23446 // Check if SETCC has already been promoted
23447 TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23448 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23449 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23450
23451 if (TValIsAllZeros || FValIsAllOnes) {
23452 SDValue CC = Cond.getOperand(2);
23453 ISD::CondCode NewCC =
23454 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23455 Cond.getOperand(0).getValueType().isInteger());
23456 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23457 std::swap(LHS, RHS);
23458 TValIsAllOnes = FValIsAllOnes;
23459 FValIsAllZeros = TValIsAllZeros;
23460 }
23461 }
23462
23463 if (TValIsAllOnes || FValIsAllZeros) {
23464 SDValue Ret;
23465
23466 if (TValIsAllOnes && FValIsAllZeros)
23467 Ret = Cond;
23468 else if (TValIsAllOnes)
23469 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23470 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23471 else if (FValIsAllZeros)
23472 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23473 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23474
23475 return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23476 }
23477 }
23478
23479 // If we know that this node is legal then we know that it is going to be
23480 // matched by one of the SSE/AVX BLEND instructions. These instructions only
23481 // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23482 // to simplify previous instructions.
23483 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23484 !DCI.isBeforeLegalize() &&
23485 // We explicitly check against v8i16 and v16i16 because, although
23486 // they're marked as Custom, they might only be legal when Cond is a
23487 // build_vector of constants. This will be taken care in a later
23488 // condition.
23489 (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23490 VT != MVT::v8i16) &&
23491 // Don't optimize vector of constants. Those are handled by
23492 // the generic code and all the bits must be properly set for
23493 // the generic optimizer.
23494 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23495 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23496
23497 // Don't optimize vector selects that map to mask-registers.
23498 if (BitWidth == 1)
23499 return SDValue();
23500
23501 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23502 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23503
23504 APInt KnownZero, KnownOne;
23505 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23506 DCI.isBeforeLegalizeOps());
23507 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23508 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23509 TLO)) {
23510 // If we changed the computation somewhere in the DAG, this change
23511 // will affect all users of Cond.
23512 // Make sure it is fine and update all the nodes so that we do not
23513 // use the generic VSELECT anymore. Otherwise, we may perform
23514 // wrong optimizations as we messed up with the actual expectation
23515 // for the vector boolean values.
23516 if (Cond != TLO.Old) {
23517 // Check all uses of that condition operand to check whether it will be
23518 // consumed by non-BLEND instructions, which may depend on all bits being
23519 // set properly.
23520 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23521 I != E; ++I)
23522 if (I->getOpcode() != ISD::VSELECT)
23523 // TODO: Add other opcodes eventually lowered into BLEND.
23524 return SDValue();
23525
23526 // Update all the users of the condition, before committing the change,
23527 // so that the VSELECT optimizations that expect the correct vector
23528 // boolean value will not be triggered.
23529 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23530 I != E; ++I)
23531 DAG.ReplaceAllUsesOfValueWith(
23532 SDValue(*I, 0),
23533 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23534 Cond, I->getOperand(1), I->getOperand(2)));
23535 DCI.CommitTargetLoweringOpt(TLO);
23536 return SDValue();
23537 }
23538 // At this point, only Cond is changed. Change the condition
23539 // just for N to keep the opportunity to optimize all other
23540 // users their own way.
23541 DAG.ReplaceAllUsesOfValueWith(
23542 SDValue(N, 0),
23543 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23544 TLO.New, N->getOperand(1), N->getOperand(2)));
23545 return SDValue();
23546 }
23547 }
23548
23549 // We should generate an X86ISD::BLENDI from a vselect if its argument
23550 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23551 // constants. This specific pattern gets generated when we split a
23552 // selector for a 512 bit vector in a machine without AVX512 (but with
23553 // 256-bit vectors), during legalization:
23554 //
23555 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23556 //
23557 // Iff we find this pattern and the build_vectors are built from
23558 // constants, we translate the vselect into a shuffle_vector that we
23559 // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23560 if ((N->getOpcode() == ISD::VSELECT ||
23561 N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23562 !DCI.isBeforeLegalize()) {
23563 SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23564 if (Shuffle.getNode())
23565 return Shuffle;
23566 }
23567
23568 return SDValue();
23569}
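The select-of-constants folds inside PerformSELECTCombine rest on simple arithmetic identities over a 0/1 condition: a power-of-two/zero select is a shift of the zero-extended condition, adjacent constants reduce to an add, and a small "fast multiplier" difference plus a base maps onto an LEA-style multiply-add. A scalar sketch with hypothetical constants:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t c = 0; c <= 1; ++c) {
    assert((c ? 8u : 0u) == (c << 3));     // C ? 8 : 0   -> zext(C) << 3
    assert((c ? 6u : 5u) == (c + 5));      // cst+1 : cst -> zext(C) + cst
    assert((c ? 13u : 4u) == (c * 9 + 4)); // diff 9 is a fast multiplier: cond*9 + base -> LEA
  }
  return 0;
}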
23570
23571// Check whether a boolean test is testing a boolean value generated by
23572// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23573// code.
23574//
23575// Simplify the following patterns:
23576// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23577// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23578// to (Op EFLAGS Cond)
23579//
23580// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23581// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23582// to (Op EFLAGS !Cond)
23583//
23584// where Op could be BRCOND or CMOV.
23585//
23586static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23587 // Quit unless this is a CMP, or a SUB whose value result is unused.
23588 if (Cmp.getOpcode() != X86ISD::CMP &&
23589 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23590 return SDValue();
23591
23592 // Quit if not used as a boolean value.
23593 if (CC != X86::COND_E && CC != X86::COND_NE)
23594 return SDValue();
23595
23596 // Check CMP operands. One of them should be 0 or 1 and the other should be
23597 // a SetCC or extended from it.
23598 SDValue Op1 = Cmp.getOperand(0);
23599 SDValue Op2 = Cmp.getOperand(1);
23600
23601 SDValue SetCC;
23602 const ConstantSDNode* C = nullptr;
23603 bool needOppositeCond = (CC == X86::COND_E);
23604 bool checkAgainstTrue = false; // Is it a comparison against 1?
23605
23606 if ((C = dyn_cast<ConstantSDNode>(Op1)))
23607 SetCC = Op2;
23608 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23609 SetCC = Op1;
23610 else // Quit if neither operand is a constant.
23611 return SDValue();
23612
23613 if (C->getZExtValue() == 1) {
23614 needOppositeCond = !needOppositeCond;
23615 checkAgainstTrue = true;
23616 } else if (C->getZExtValue() != 0)
23617 // Quit if the constant is neither 0 nor 1.
23618 return SDValue();
23619
23620 bool truncatedToBoolWithAnd = false;
23621 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23622 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23623 SetCC.getOpcode() == ISD::TRUNCATE ||
23624 SetCC.getOpcode() == ISD::AND) {
23625 if (SetCC.getOpcode() == ISD::AND) {
23626 int OpIdx = -1;
23627 ConstantSDNode *CS;
23628 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23629 CS->getZExtValue() == 1)
23630 OpIdx = 1;
23631 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23632 CS->getZExtValue() == 1)
23633 OpIdx = 0;
23634 if (OpIdx == -1)
23635 break;
23636 SetCC = SetCC.getOperand(OpIdx);
23637 truncatedToBoolWithAnd = true;
23638 } else
23639 SetCC = SetCC.getOperand(0);
23640 }
23641
23642 switch (SetCC.getOpcode()) {
23643 case X86ISD::SETCC_CARRY:
23644 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23645 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23646 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23647 // truncated to i1 using 'and'.
23648 if (checkAgainstTrue && !truncatedToBoolWithAnd)
23649 break;
23650 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23651        "Invalid use of SETCC_CARRY!");
23652 // FALL THROUGH
23653 case X86ISD::SETCC:
23654 // Set the condition code or opposite one if necessary.
23655 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23656 if (needOppositeCond)
23657 CC = X86::GetOppositeBranchCondition(CC);
23658 return SetCC.getOperand(1);
23659 case X86ISD::CMOV: {
23660 // Check whether false/true value has canonical one, i.e. 0 or 1.
23661 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23662 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23663 // Quit if true value is not a constant.
23664 if (!TVal)
23665 return SDValue();
23666 // Quit if false value is not a constant.
23667 if (!FVal) {
23668 SDValue Op = SetCC.getOperand(0);
23669 // Skip 'zext' or 'trunc' node.
23670 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23671 Op.getOpcode() == ISD::TRUNCATE)
23672 Op = Op.getOperand(0);
23673 // A special case for rdrand/rdseed, where 0 is set if false cond is
23674 // found.
23675 if ((Op.getOpcode() != X86ISD::RDRAND &&
23676 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23677 return SDValue();
23678 }
23679 // Quit if false value is not the constant 0 or 1.
23680 bool FValIsFalse = true;
23681 if (FVal && FVal->getZExtValue() != 0) {
23682 if (FVal->getZExtValue() != 1)
23683 return SDValue();
23684 // If FVal is 1, opposite cond is needed.
23685 needOppositeCond = !needOppositeCond;
23686 FValIsFalse = false;
23687 }
23688 // Quit if TVal is not the constant opposite of FVal.
23689 if (FValIsFalse && TVal->getZExtValue() != 1)
23690 return SDValue();
23691 if (!FValIsFalse && TVal->getZExtValue() != 0)
23692 return SDValue();
23693 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23694 if (needOppositeCond)
23695 CC = X86::GetOppositeBranchCondition(CC);
23696 return SetCC.getOperand(3);
23697 }
23698 }
23699
23700 return SDValue();
23701}
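
// Illustrative sketch, not part of X86ISelLowering.cpp: the scalar equivalence
// that checkBoolTestSetCCCombine above exploits. A SETCC materializes its
// condition as 0 or 1, so comparing that value against 0 or 1 and testing
// EQ/NE is the same as testing the condition (or its opposite) directly.
#include <cassert>
#include <initializer_list>

int main() {
  for (bool cond : {false, true}) {
    int v = cond ? 1 : 0;        // what an X86ISD::SETCC materializes
    assert((v == 1) == cond);    // (CMP (SETCC cond) 1) EQ  -> cond
    assert((v != 0) == cond);    // (CMP (SETCC cond) 0) NEQ -> cond
    assert((v == 0) == !cond);   // (CMP (SETCC cond) 0) EQ  -> !cond
    assert((v != 1) == !cond);   // (CMP (SETCC cond) 1) NEQ -> !cond
  }
  return 0;
}
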
23702
23703/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23704static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23705 TargetLowering::DAGCombinerInfo &DCI,
23706 const X86Subtarget *Subtarget) {
23707 SDLoc DL(N);
23708
23709 // If the flag operand isn't dead, don't touch this CMOV.
23710 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23711 return SDValue();
23712
23713 SDValue FalseOp = N->getOperand(0);
23714 SDValue TrueOp = N->getOperand(1);
23715 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23716 SDValue Cond = N->getOperand(3);
23717
23718 if (CC == X86::COND_E || CC == X86::COND_NE) {
23719 switch (Cond.getOpcode()) {
23720 default: break;
23721 case X86ISD::BSR:
23722 case X86ISD::BSF:
23723 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
23724 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23725 return (CC == X86::COND_E) ? FalseOp : TrueOp;
23726 }
23727 }
23728
23729 SDValue Flags;
23730
23731 Flags = checkBoolTestSetCCCombine(Cond, CC);
23732 if (Flags.getNode() &&
23733 // Extra check as FCMOV only supports a subset of X86 cond.
23734 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23735 SDValue Ops[] = { FalseOp, TrueOp,
23736 DAG.getConstant(CC, MVT::i8), Flags };
23737 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23738 }
23739
23740 // If this is a select between two integer constants, try to do some
23741 // optimizations. Note that the operands are ordered the opposite of SELECT
23742 // operands.
23743 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23744 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23745 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23746 // larger than FalseC (the false value).
23747 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23748 CC = X86::GetOppositeBranchCondition(CC);
23749 std::swap(TrueC, FalseC);
23750 std::swap(TrueOp, FalseOp);
23751 }
23752
23753 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
23754 // This is efficient for any integer data type (including i8/i16) and
23755 // shift amount.
23756 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23757 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23758 DAG.getConstant(CC, MVT::i8), Cond);
23759
23760 // Zero extend the condition if needed.
23761 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23762
23763 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23764 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23765 DAG.getConstant(ShAmt, MVT::i8));
23766 if (N->getNumValues() == 2) // Dead flag value?
23767 return DCI.CombineTo(N, Cond, SDValue());
23768 return Cond;
23769 }
23770
23771 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
23772 // for any integer data type, including i8/i16.
23773 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23774 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23775 DAG.getConstant(CC, MVT::i8), Cond);
23776
23777 // Zero extend the condition if needed.
23778 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23779 FalseC->getValueType(0), Cond);
23780 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23781 SDValue(FalseC, 0));
23782
23783 if (N->getNumValues() == 2) // Dead flag value?
23784 return DCI.CombineTo(N, Cond, SDValue());
23785 return Cond;
23786 }
23787
23788 // Optimize cases that will turn into an LEA instruction. This requires
23789 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23790 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23791 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23792 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23793
23794 bool isFastMultiplier = false;
23795 if (Diff < 10) {
23796 switch ((unsigned char)Diff) {
23797 default: break;
23798 case 1: // result = add base, cond
23799 case 2: // result = lea base( , cond*2)
23800 case 3: // result = lea base(cond, cond*2)
23801 case 4: // result = lea base( , cond*4)
23802 case 5: // result = lea base(cond, cond*4)
23803 case 8: // result = lea base( , cond*8)
23804 case 9: // result = lea base(cond, cond*8)
23805 isFastMultiplier = true;
23806 break;
23807 }
23808 }
23809
23810 if (isFastMultiplier) {
23811 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23812 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23813 DAG.getConstant(CC, MVT::i8), Cond);
23814 // Zero extend the condition if needed.
23815 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23816 Cond);
23817 // Scale the condition by the difference.
23818 if (Diff != 1)
23819 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23820 DAG.getConstant(Diff, Cond.getValueType()));
23821
23822 // Add the base if non-zero.
23823 if (FalseC->getAPIntValue() != 0)
23824 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23825 SDValue(FalseC, 0));
23826 if (N->getNumValues() == 2) // Dead flag value?
23827 return DCI.CombineTo(N, Cond, SDValue());
23828 return Cond;
23829 }
23830 }
23831 }
23832 }
23833
23834 // Handle these cases:
23835 // (select (x != c), e, c) -> (select (x != c), e, x),
23836 // (select (x == c), c, e) -> (select (x == c), x, e)
23837 // where the c is an integer constant, and the "select" is the combination
23838 // of CMOV and CMP.
23839 //
23840 // The rationale for this change is that a conditional move from a constant
23841 // needs two instructions, whereas a conditional move from a register needs
23842 // only one instruction.
23843 //
23844 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23845 // some instruction-combining opportunities. This opt needs to be
23846 // postponed as late as possible.
23847 //
23848 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23849 // the DCI.xxxx conditions are provided to postpone the optimization as
23850 // late as possible.
23851
23852 ConstantSDNode *CmpAgainst = nullptr;
23853 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23854 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23855 !isa<ConstantSDNode>(Cond.getOperand(0))) {
23856
23857 if (CC == X86::COND_NE &&
23858 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23859 CC = X86::GetOppositeBranchCondition(CC);
23860 std::swap(TrueOp, FalseOp);
23861 }
23862
23863 if (CC == X86::COND_E &&
23864 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23865 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23866 DAG.getConstant(CC, MVT::i8), Cond };
23867 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
23868 }
23869 }
23870 }
23871
23872 return SDValue();
23873}
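
// A standalone sketch (not LLVM code) of the constant-select identities used
// by PerformCMOVCombine above: a power-of-two/zero select becomes a shifted
// setcc, adjacent constants become setcc plus the base, and an LEA-friendly
// difference (here 9) becomes a scaled setcc plus the base.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (bool c : {false, true}) {
    uint32_t z = c ? 1u : 0u;                 // zext(setcc(C))
    assert((c ? 8u : 0u) == (z << 3));        // C ? 8 : 0       -> zext(setcc(C)) << 3
    assert((c ? 6u : 5u) == (z + 5u));        // C ? cst+1 : cst -> zext(setcc(C)) + cst
    assert((c ? 13u : 4u) == (z * 9u + 4u));  // diff of 9 is a fast (LEA) multiplier
  }
  return 0;
}
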
23874
23875static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
23876 const X86Subtarget *Subtarget) {
23877 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
23878 switch (IntNo) {
23879 default: return SDValue();
23880 // SSE/AVX/AVX2 blend intrinsics.
23881 case Intrinsic::x86_avx2_pblendvb:
23882 case Intrinsic::x86_avx2_pblendw:
23883 case Intrinsic::x86_avx2_pblendd_128:
23884 case Intrinsic::x86_avx2_pblendd_256:
23885 // Don't try to simplify this intrinsic if we don't have AVX2.
23886 if (!Subtarget->hasAVX2())
23887 return SDValue();
23888 // FALL-THROUGH
23889 case Intrinsic::x86_avx_blend_pd_256:
23890 case Intrinsic::x86_avx_blend_ps_256:
23891 case Intrinsic::x86_avx_blendv_pd_256:
23892 case Intrinsic::x86_avx_blendv_ps_256:
23893 // Don't try to simplify this intrinsic if we don't have AVX.
23894 if (!Subtarget->hasAVX())
23895 return SDValue();
23896 // FALL-THROUGH
23897 case Intrinsic::x86_sse41_pblendw:
23898 case Intrinsic::x86_sse41_blendpd:
23899 case Intrinsic::x86_sse41_blendps:
23900 case Intrinsic::x86_sse41_blendvps:
23901 case Intrinsic::x86_sse41_blendvpd:
23902 case Intrinsic::x86_sse41_pblendvb: {
23903 SDValue Op0 = N->getOperand(1);
23904 SDValue Op1 = N->getOperand(2);
23905 SDValue Mask = N->getOperand(3);
23906
23907 // Don't try to simplify this intrinsic if we don't have SSE4.1.
23908 if (!Subtarget->hasSSE41())
23909 return SDValue();
23910
23911 // fold (blend A, A, Mask) -> A
23912 if (Op0 == Op1)
23913 return Op0;
23914 // fold (blend A, B, allZeros) -> A
23915 if (ISD::isBuildVectorAllZeros(Mask.getNode()))
23916 return Op0;
23917 // fold (blend A, B, allOnes) -> B
23918 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23919 return Op1;
23920
23921 // Simplify the case where the mask is a constant i32 value.
23922 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
23923 if (C->isNullValue())
23924 return Op0;
23925 if (C->isAllOnesValue())
23926 return Op1;
23927 }
23928
23929 return SDValue();
23930 }
23931
23932 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
23933 case Intrinsic::x86_sse2_psrai_w:
23934 case Intrinsic::x86_sse2_psrai_d:
23935 case Intrinsic::x86_avx2_psrai_w:
23936 case Intrinsic::x86_avx2_psrai_d:
23937 case Intrinsic::x86_sse2_psra_w:
23938 case Intrinsic::x86_sse2_psra_d:
23939 case Intrinsic::x86_avx2_psra_w:
23940 case Intrinsic::x86_avx2_psra_d: {
23941 SDValue Op0 = N->getOperand(1);
23942 SDValue Op1 = N->getOperand(2);
23943 EVT VT = Op0.getValueType();
23944 assert(VT.isVector() && "Expected a vector type!");
23945
23946 if (isa<BuildVectorSDNode>(Op1))
23947 Op1 = Op1.getOperand(0);
23948
23949 if (!isa<ConstantSDNode>(Op1))
23950 return SDValue();
23951
23952 EVT SVT = VT.getVectorElementType();
23953 unsigned SVTBits = SVT.getSizeInBits();
23954
23955 ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
23956 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
23957 uint64_t ShAmt = C.getZExtValue();
23958
23959 // Don't try to convert this shift into an ISD::SRA if the shift
23960 // count is bigger than or equal to the element size.
23961 if (ShAmt >= SVTBits)
23962 return SDValue();
23963
23964 // Trivial case: if the shift count is zero, then fold this
23965 // into the first operand.
23966 if (ShAmt == 0)
23967 return Op0;
23968
23969 // Replace this packed shift intrinsic with a target independent
23970 // shift dag node.
23971 SDValue Splat = DAG.getConstant(C, VT);
23972 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
23973 }
23974 }
23975}
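
// Illustrative sketch, not from the LLVM sources: a per-lane view of the blend
// folds performed by the intrinsic combine above. An all-zeros mask selects the
// first operand, an all-ones mask selects the second, and identical operands
// make the mask irrelevant. (The psrai cases similarly reduce to a plain
// per-element arithmetic shift once the count is a small constant.)
#include <cassert>

static int blendLane(int a, int b, bool maskBit) { return maskBit ? b : a; }

int main() {
  int A = 3, B = 7;
  assert(blendLane(A, A, true) == A && blendLane(A, A, false) == A); // (blend A, A, Mask) -> A
  assert(blendLane(A, B, false) == A);                               // all-zeros mask     -> A
  assert(blendLane(A, B, true) == B);                                // all-ones mask      -> B
  return 0;
}
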
23976
23977/// PerformMulCombine - Optimize a single multiply with constant into two
23978/// in order to implement it with two cheaper instructions, e.g.
23979/// LEA + SHL, LEA + LEA.
23980static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
23981 TargetLowering::DAGCombinerInfo &DCI) {
23982 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
23983 return SDValue();
23984
23985 EVT VT = N->getValueType(0);
23986 if (VT != MVT::i64 && VT != MVT::i32)
23987 return SDValue();
23988
23989 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23990 if (!C)
23991 return SDValue();
23992 uint64_t MulAmt = C->getZExtValue();
23993 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
23994 return SDValue();
23995
23996 uint64_t MulAmt1 = 0;
23997 uint64_t MulAmt2 = 0;
23998 if ((MulAmt % 9) == 0) {
23999 MulAmt1 = 9;
24000 MulAmt2 = MulAmt / 9;
24001 } else if ((MulAmt % 5) == 0) {
24002 MulAmt1 = 5;
24003 MulAmt2 = MulAmt / 5;
24004 } else if ((MulAmt % 3) == 0) {
24005 MulAmt1 = 3;
24006 MulAmt2 = MulAmt / 3;
24007 }
24008 if (MulAmt2 &&
24009 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24010 SDLoc DL(N);
24011
24012 if (isPowerOf2_64(MulAmt2) &&
24013 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24014 // If the second multiplier is a power of two, issue it first. We want the multiply by
24015 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24016 // is an add.
24017 std::swap(MulAmt1, MulAmt2);
24018
24019 SDValue NewMul;
24020 if (isPowerOf2_64(MulAmt1))
24021 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24022 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24023 else
24024 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24025 DAG.getConstant(MulAmt1, VT));
24026
24027 if (isPowerOf2_64(MulAmt2))
24028 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24029 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24030 else
24031 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24032 DAG.getConstant(MulAmt2, VT));
24033
24034 // Do not add new nodes to DAG combiner worklist.
24035 DCI.CombineTo(N, NewMul, false);
24036 }
24037 return SDValue();
24038}
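
// Illustrative sketch, not part of X86ISelLowering.cpp: the decomposition that
// PerformMulCombine above relies on. A multiply by 40 can be issued as a
// multiply by 5 (an LEA) followed by a shift by 3, and a multiply by 45 as two
// LEA-style multiplies (by 9 and by 5).
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 12345;
  assert(x * 40 == (x * 5) << 3);  // MulAmt = 40: MulAmt1 = 5 (LEA), MulAmt2 = 8 (SHL)
  assert(x * 45 == (x * 9) * 5);   // MulAmt = 45: MulAmt1 = 9,       MulAmt2 = 5
  return 0;
}
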
24039
24040static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24041 SDValue N0 = N->getOperand(0);
24042 SDValue N1 = N->getOperand(1);
24043 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24044 EVT VT = N0.getValueType();
24045
24046 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24047 // since the result of setcc_c is all zero's or all ones.
24048 if (VT.isInteger() && !VT.isVector() &&
24049 N1C && N0.getOpcode() == ISD::AND &&
24050 N0.getOperand(1).getOpcode() == ISD::Constant) {
24051 SDValue N00 = N0.getOperand(0);
24052 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24053 ((N00.getOpcode() == ISD::ANY_EXTEND ||
24054 N00.getOpcode() == ISD::ZERO_EXTEND) &&
24055 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24056 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24057 APInt ShAmt = N1C->getAPIntValue();
24058 Mask = Mask.shl(ShAmt);
24059 if (Mask != 0)
24060 return DAG.getNode(ISD::AND, SDLoc(N), VT,
24061 N00, DAG.getConstant(Mask, VT));
24062 }
24063 }
24064
24065 // Hardware support for vector shifts is sparse which makes us scalarize the
24066 // vector operations in many cases. Also, on Sandy Bridge, ADD is faster than
24067 // shl.
24068 // (shl V, 1) -> add V,V
24069 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24070 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24071 assert(N0.getValueType().isVector() && "Invalid vector shift type");
24072 // We shift all of the values by one. In many cases we do not have
24073 // hardware support for this operation. This is better expressed as an ADD
24074 // of two values.
24075 if (N1SplatC->getZExtValue() == 1)
24076 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24077 }
24078
24079 return SDValue();
24080}
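
// A standalone sketch (not LLVM code) of the two SHL rewrites above in scalar
// form. Because SETCC_CARRY produces all zeros or all ones, shifting the masked
// value is the same as masking with the shifted constant; and a shift-left by
// one is just an add of the value to itself.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint32_t setcc_c : {0u, ~0u}) {   // SETCC_CARRY result: all zeros or all ones
    uint32_t c1 = 0xFF, c2 = 4;
    assert(((setcc_c & c1) << c2) == (setcc_c & (c1 << c2)));
  }
  uint32_t v = 0x1234;
  assert((v << 1) == v + v);             // (shl V, 1) -> add V, V
  return 0;
}
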
24081
24082/// \brief Returns a vector of 0s if the input node is a vector logical
24083/// shift by a constant amount which is known to be bigger than or equal
24084/// to the vector element size in bits.
24085static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24086 const X86Subtarget *Subtarget) {
24087 EVT VT = N->getValueType(0);
24088
24089 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24090 (!Subtarget->hasInt256() ||
24091 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24092 return SDValue();
24093
24094 SDValue Amt = N->getOperand(1);
24095 SDLoc DL(N);
24096 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24097 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24098 APInt ShiftAmt = AmtSplat->getAPIntValue();
24099 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24100
24101 // SSE2/AVX2 logical shifts always return a vector of 0s
24102 // if the shift amount is bigger than or equal to
24103 // the element size. The constant shift amount will be
24104 // encoded as an 8-bit immediate.
24105 if (ShiftAmt.trunc(8).uge(MaxAmount))
24106 return getZeroVector(VT, Subtarget, DAG, DL);
24107 }
24108
24109 return SDValue();
24110}
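
// Illustrative sketch only (a model, not LLVM or SSE intrinsic code): the
// hardware behaviour performShiftToAllZeros above depends on. An SSE2/AVX2
// packed logical shift with an immediate count greater than or equal to the
// element width produces all zeros, unlike a generic ISD::SHL/SRL where such a
// count is undefined. One 16-bit PSLLW lane is modelled here.
#include <cassert>
#include <cstdint>

static uint16_t psllwLane(uint16_t x, unsigned count) {
  if (count >= 16) return 0;                    // count >= element size -> zero lane
  return static_cast<uint16_t>(x << count);
}

int main() {
  assert(psllwLane(0xABCD, 16) == 0);           // the fold to a zero vector
  assert(psllwLane(0xABCD, 3) == static_cast<uint16_t>(0xABCD << 3));
  return 0;
}
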
24111
24112/// PerformShiftCombine - Combine shifts.
24113static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24114 TargetLowering::DAGCombinerInfo &DCI,
24115 const X86Subtarget *Subtarget) {
24116 if (N->getOpcode() == ISD::SHL) {
24117 SDValue V = PerformSHLCombine(N, DAG);
24118 if (V.getNode()) return V;
24119 }
24120
24121 if (N->getOpcode() != ISD::SRA) {
24122 // Try to fold this logical shift into a zero vector.
24123 SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24124 if (V.getNode()) return V;
24125 }
24126
24127 return SDValue();
24128}
24129
24130// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
24131// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24132// and friends. Likewise for OR -> CMPNEQSS.
24133static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24134 TargetLowering::DAGCombinerInfo &DCI,
24135 const X86Subtarget *Subtarget) {
24136 unsigned opcode;
24137
24138 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24139 // we're requiring SSE2 for both.
24140 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24141 SDValue N0 = N->getOperand(0);
24142 SDValue N1 = N->getOperand(1);
24143 SDValue CMP0 = N0->getOperand(1);
24144 SDValue CMP1 = N1->getOperand(1);
24145 SDLoc DL(N);
24146
24147 // The SETCCs should both refer to the same CMP.
24148 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24149 return SDValue();
24150
24151 SDValue CMP00 = CMP0->getOperand(0);
24152 SDValue CMP01 = CMP0->getOperand(1);
24153 EVT VT = CMP00.getValueType();
24154
24155 if (VT == MVT::f32 || VT == MVT::f64) {
24156 bool ExpectingFlags = false;
24157 // Check for any users that want flags:
24158 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24159 !ExpectingFlags && UI != UE; ++UI)
24160 switch (UI->getOpcode()) {
24161 default:
24162 case ISD::BR_CC:
24163 case ISD::BRCOND:
24164 case ISD::SELECT:
24165 ExpectingFlags = true;
24166 break;
24167 case ISD::CopyToReg:
24168 case ISD::SIGN_EXTEND:
24169 case ISD::ZERO_EXTEND:
24170 case ISD::ANY_EXTEND:
24171 break;
24172 }
24173
24174 if (!ExpectingFlags) {
24175 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24176 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24177
24178 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24179 X86::CondCode tmp = cc0;
24180 cc0 = cc1;
24181 cc1 = tmp;
24182 }
24183
24184 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
24185 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24186 // FIXME: need symbolic constants for these magic numbers.
24187 // See X86ATTInstPrinter.cpp:printSSECC().
24188 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24189 if (Subtarget->hasAVX512()) {
24190 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24191 CMP01, DAG.getConstant(x86cc, MVT::i8));
24192 if (N->getValueType(0) != MVT::i1)
24193 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24194 FSetCC);
24195 return FSetCC;
24196 }
24197 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24198 CMP00.getValueType(), CMP00, CMP01,
24199 DAG.getConstant(x86cc, MVT::i8));
24200
24201 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24202 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24203
24204 if (is64BitFP && !Subtarget->is64Bit()) {
24205 // On a 32-bit target, we cannot bitcast the 64-bit float to a
24206 // 64-bit integer, since that's not a legal type. Since
24207 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
24208 // bits, but can do this little dance to extract the lowest 32 bits
24209 // and work with those going forward.
24210 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24211 OnesOrZeroesF);
24212 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24213 Vector64);
24214 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24215 Vector32, DAG.getIntPtrConstant(0));
24216 IntVT = MVT::i32;
24217 }
24218
24219 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24220 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24221 DAG.getConstant(1, IntVT));
24222 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24223 return OneBitOfTruth;
24224 }
24225 }
24226 }
24227 }
24228 return SDValue();
24229}
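
// Illustrative sketch, not from the LLVM sources: the tail end of CMPEQCombine
// above in scalar form. CMPEQSS/CMPNEQSS write an all-ones or all-zeros mask
// the width of the float; AND-ing its bits with 1 and truncating to i8 yields
// the same boolean the original pair of SETCCs computed.
#include <cassert>
#include <cstdint>

int main() {
  float a = 1.5f, b = 1.5f;
  uint32_t onesOrZeroes = (a == b) ? 0xFFFFFFFFu : 0u;            // what CMPEQSS produces
  uint8_t oneBitOfTruth = static_cast<uint8_t>(onesOrZeroes & 1u); // AND 1, truncate to i8
  assert(oneBitOfTruth == (a == b ? 1 : 0));
  return 0;
}
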
24230
24231/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
24232/// so it can be folded inside ANDNP.
24233static bool CanFoldXORWithAllOnes(const SDNode *N) {
24234 EVT VT = N->getValueType(0);
24235
24236 // Match direct AllOnes for 128 and 256-bit vectors
24237 if (ISD::isBuildVectorAllOnes(N))
24238 return true;
24239
24240 // Look through a bit convert.
24241 if (N->getOpcode() == ISD::BITCAST)
24242 N = N->getOperand(0).getNode();
24243
24244 // Sometimes the operand may come from an insert_subvector building a 256-bit
24245 // allones vector
24246 if (VT.is256BitVector() &&
24247 N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24248 SDValue V1 = N->getOperand(0);
24249 SDValue V2 = N->getOperand(1);
24250
24251 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24252 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24253 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24254 ISD::isBuildVectorAllOnes(V2.getNode()))
24255 return true;
24256 }
24257
24258 return false;
24259}
24260
24261// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24262// register. In most cases we actually compare or select YMM-sized registers
24263// and mixing the two types creates horrible code. This method optimizes
24264// some of the transition sequences.
24265static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24266 TargetLowering::DAGCombinerInfo &DCI,
24267 const X86Subtarget *Subtarget) {
24268 EVT VT = N->getValueType(0);
24269 if (!VT.is256BitVector())
24270 return SDValue();
24271
24272 assert((N->getOpcode() == ISD::ANY_EXTEND ||
24273         N->getOpcode() == ISD::ZERO_EXTEND ||
24274         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24275
24276 SDValue Narrow = N->getOperand(0);
24277 EVT NarrowVT = Narrow->getValueType(0);
24278 if (!NarrowVT.is128BitVector())
24279 return SDValue();
24280
24281 if (Narrow->getOpcode() != ISD::XOR &&
24282 Narrow->getOpcode() != ISD::AND &&
24283 Narrow->getOpcode() != ISD::OR)
24284 return SDValue();
24285
24286 SDValue N0 = Narrow->getOperand(0);
24287 SDValue N1 = Narrow->getOperand(1);
24288 SDLoc DL(Narrow);
24289
24290 // The left side has to be a trunc.
24291 if (N0.getOpcode() != ISD::TRUNCATE)
24292 return SDValue();
24293
24294 // The type of the truncated inputs.
24295 EVT WideVT = N0->getOperand(0)->getValueType(0);
24296 if (WideVT != VT)
24297 return SDValue();
24298
24299 // The right side has to be a 'trunc' or a constant vector.
24300 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24301 ConstantSDNode *RHSConstSplat = nullptr;
24302 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24303 RHSConstSplat = RHSBV->getConstantSplatNode();
24304 if (!RHSTrunc && !RHSConstSplat)
24305 return SDValue();
24306
24307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24308
24309 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24310 return SDValue();
24311
24312 // Set N0 and N1 to hold the inputs to the new wide operation.
24313 N0 = N0->getOperand(0);
24314 if (RHSConstSplat) {
24315 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24316 SDValue(RHSConstSplat, 0));
24317 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24318 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24319 } else if (RHSTrunc) {
24320 N1 = N1->getOperand(0);
24321 }
24322
24323 // Generate the wide operation.
24324 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24325 unsigned Opcode = N->getOpcode();
24326 switch (Opcode) {
24327 case ISD::ANY_EXTEND:
24328 return Op;
24329 case ISD::ZERO_EXTEND: {
24330 unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24331 APInt Mask = APInt::getAllOnesValue(InBits);
24332 Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24333 return DAG.getNode(ISD::AND, DL, VT,
24334 Op, DAG.getConstant(Mask, VT));
24335 }
24336 case ISD::SIGN_EXTEND:
24337 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24338 Op, DAG.getValueType(NarrowVT));
24339 default:
24340 llvm_unreachable("Unexpected opcode");
24341 }
24342}
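
// A standalone sketch (not LLVM code) of the ZERO_EXTEND case handled by
// WidenMaskArithmetic above: zero-extending the narrow logic op is the same as
// performing the op on the wide values and masking off the high bits, when the
// narrow inputs were truncations of those wide values. Shown for a 16-bit AND
// widened to 32 bits.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t wideX = 0xABCD1234u, wideY = 0x0F0F00FFu;
  uint16_t nx = static_cast<uint16_t>(wideX);  // the trunc of the wide value
  uint16_t ny = static_cast<uint16_t>(wideY);
  uint32_t narrowThenExtend = static_cast<uint32_t>(static_cast<uint16_t>(nx & ny));
  uint32_t wideThenMask = (wideX & wideY) & 0xFFFFu;   // wide op, then AND with the zext'd all-ones mask
  assert(narrowThenExtend == wideThenMask);
  return 0;
}
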
24343
24344static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24345 TargetLowering::DAGCombinerInfo &DCI,
24346 const X86Subtarget *Subtarget) {
24347 EVT VT = N->getValueType(0);
24348 if (DCI.isBeforeLegalizeOps())
24349 return SDValue();
24350
24351 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24352 if (R.getNode())
24353 return R;
24354
24355 // Create BEXTR instructions
24356 // BEXTR is ((X >> imm) & (2**size-1))
24357 if (VT == MVT::i32 || VT == MVT::i64) {
24358 SDValue N0 = N->getOperand(0);
24359 SDValue N1 = N->getOperand(1);
24360 SDLoc DL(N);
24361
24362 // Check for BEXTR.
24363 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24364 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24365 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24366 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24367 if (MaskNode && ShiftNode) {
24368 uint64_t Mask = MaskNode->getZExtValue();
24369 uint64_t Shift = ShiftNode->getZExtValue();
24370 if (isMask_64(Mask)) {
24371 uint64_t MaskSize = CountPopulation_64(Mask);
24372 if (Shift + MaskSize <= VT.getSizeInBits())
24373 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24374 DAG.getConstant(Shift | (MaskSize << 8), VT));
24375 }
24376 }
24377 } // BEXTR
24378
24379 return SDValue();
24380 }
24381
24382 // Want to form ANDNP nodes:
24383 // 1) In the hopes of then easily combining them with OR and AND nodes
24384 // to form PBLEND/PSIGN.
24385 // 2) To match ANDN packed intrinsics
24386 if (VT != MVT::v2i64 && VT != MVT::v4i64)
24387 return SDValue();
24388
24389 SDValue N0 = N->getOperand(0);
24390 SDValue N1 = N->getOperand(1);
24391 SDLoc DL(N);
24392
24393 // Check LHS for vnot
24394 if (N0.getOpcode() == ISD::XOR &&
24395 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24396 CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24397 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24398
24399 // Check RHS for vnot
24400 if (N1.getOpcode() == ISD::XOR &&
24401 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24402 CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24403 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24404
24405 return SDValue();
24406}
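
// Illustrative sketch, not part of X86ISelLowering.cpp: the BEXTR pattern
// recognized by PerformAndCombine above. ((X >> Shift) & ((1 << MaskSize) - 1))
// extracts MaskSize bits starting at bit Shift, and the control operand is
// encoded as Shift | (MaskSize << 8).
#include <cassert>
#include <cstdint>

static uint64_t bextrModel(uint64_t x, uint32_t control) {
  unsigned shift = control & 0xFF;
  unsigned maskSize = (control >> 8) & 0xFF;
  uint64_t mask = maskSize >= 64 ? ~0ULL : ((1ULL << maskSize) - 1);
  return (x >> shift) & mask;
}

int main() {
  uint64_t x = 0x123456789ABCDEF0ULL;
  uint32_t control = 8u | (16u << 8);            // Shift = 8, MaskSize = 16
  assert(bextrModel(x, control) == ((x >> 8) & 0xFFFFu));
  return 0;
}
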
24407
24408static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24409 TargetLowering::DAGCombinerInfo &DCI,
24410 const X86Subtarget *Subtarget) {
24411 if (DCI.isBeforeLegalizeOps())
24412 return SDValue();
24413
24414 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24415 if (R.getNode())
24416 return R;
24417
24418 SDValue N0 = N->getOperand(0);
24419 SDValue N1 = N->getOperand(1);
24420 EVT VT = N->getValueType(0);
24421
24422 // look for psign/blend
24423 if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24424 if (!Subtarget->hasSSSE3() ||
24425 (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24426 return SDValue();
24427
24428 // Canonicalize pandn to RHS
24429 if (N0.getOpcode() == X86ISD::ANDNP)
24430 std::swap(N0, N1);
24431 // or (and (m, y), (pandn m, x))
24432 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24433 SDValue Mask = N1.getOperand(0);
24434 SDValue X = N1.getOperand(1);
24435 SDValue Y;
24436 if (N0.getOperand(0) == Mask)
24437 Y = N0.getOperand(1);
24438 if (N0.getOperand(1) == Mask)
24439 Y = N0.getOperand(0);
24440
24441 // Check to see if the mask appeared in both the AND and the ANDNP.
24442 if (!Y.getNode())
24443 return SDValue();
24444
24445 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24446 // Look through mask bitcast.
24447 if (Mask.getOpcode() == ISD::BITCAST)
24448 Mask = Mask.getOperand(0);
24449 if (X.getOpcode() == ISD::BITCAST)
24450 X = X.getOperand(0);
24451 if (Y.getOpcode() == ISD::BITCAST)
24452 Y = Y.getOperand(0);
24453
24454 EVT MaskVT = Mask.getValueType();
24455
24456 // Validate that the Mask operand is a vector sra node.
24457 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24458 // there is no psrai.b
24459 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24460 unsigned SraAmt = ~0;
24461 if (Mask.getOpcode() == ISD::SRA) {
24462 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24463 if (auto *AmtConst = AmtBV->getConstantSplatNode())
24464 SraAmt = AmtConst->getZExtValue();
24465 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24466 SDValue SraC = Mask.getOperand(1);
24467 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
24468 }
24469 if ((SraAmt + 1) != EltBits)
24470 return SDValue();
24471
24472 SDLoc DL(N);
24473
24474 // Now we know we at least have a pblendvb with the mask val. See if
24475 // we can form a psignb/w/d.
24476 // psign = x.type == y.type == mask.type && y = sub(0, x);
24477 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24478 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24479 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24480 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24481        "Unsupported VT for PSIGN");
24482 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24483 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24484 }
24485 // PBLENDVB only available on SSE 4.1
24486 if (!Subtarget->hasSSE41())
24487 return SDValue();
24488
24489 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24490
24491 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24492 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24493 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24494 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24495 return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24496 }
24497 }
24498
24499 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24500 return SDValue();
24501
24502 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24503 MachineFunction &MF = DAG.getMachineFunction();
24504 bool OptForSize = MF.getFunction()->getAttributes().
24505 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24506
24507 // SHLD/SHRD instructions have lower register pressure, but on some
24508 // platforms they have higher latency than the equivalent
24509 // series of shifts/or that would otherwise be generated.
24510 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24511 // have higher latencies and we are not optimizing for size.
24512 if (!OptForSize && Subtarget->isSHLDSlow())
24513 return SDValue();
24514
24515 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24516 std::swap(N0, N1);
24517 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24518 return SDValue();
24519 if (!N0.hasOneUse() || !N1.hasOneUse())
24520 return SDValue();
24521
24522 SDValue ShAmt0 = N0.getOperand(1);
24523 if (ShAmt0.getValueType() != MVT::i8)
24524 return SDValue();
24525 SDValue ShAmt1 = N1.getOperand(1);
24526 if (ShAmt1.getValueType() != MVT::i8)
24527 return SDValue();
24528 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24529 ShAmt0 = ShAmt0.getOperand(0);
24530 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24531 ShAmt1 = ShAmt1.getOperand(0);
24532
24533 SDLoc DL(N);
24534 unsigned Opc = X86ISD::SHLD;
24535 SDValue Op0 = N0.getOperand(0);
24536 SDValue Op1 = N1.getOperand(0);
24537 if (ShAmt0.getOpcode() == ISD::SUB) {
24538 Opc = X86ISD::SHRD;
24539 std::swap(Op0, Op1);
24540 std::swap(ShAmt0, ShAmt1);
24541 }
24542
24543 unsigned Bits = VT.getSizeInBits();
24544 if (ShAmt1.getOpcode() == ISD::SUB) {
24545 SDValue Sum = ShAmt1.getOperand(0);
24546 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24547 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24548 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24549 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24550 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24551 return DAG.getNode(Opc, DL, VT,
24552 Op0, Op1,
24553 DAG.getNode(ISD::TRUNCATE, DL,
24554 MVT::i8, ShAmt0));
24555 }
24556 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24557 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24558 if (ShAmt0C &&
24559 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24560 return DAG.getNode(Opc, DL, VT,
24561 N0.getOperand(0), N1.getOperand(0),
24562 DAG.getNode(ISD::TRUNCATE, DL,
24563 MVT::i8, ShAmt0));
24564 }
24565
24566 return SDValue();
24567}
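
// Illustrative sketch, not from the LLVM sources: the funnel-shift identity
// behind the SHLD fold in PerformOrCombine above. For 0 < c < 64,
// (x << c) | (y >> (64 - c)) shifts the 128-bit concatenation x:y left by c and
// keeps the high 64 bits, which is exactly what SHLD computes. The reference
// computation assumes the GCC/Clang __int128 extension.
#include <cassert>
#include <cstdint>

static uint64_t shldModel(uint64_t x, uint64_t y, unsigned c) {
  return (x << c) | (y >> (64 - c));   // valid for 0 < c < 64
}

int main() {
  uint64_t x = 0x0123456789ABCDEFULL, y = 0xFEDCBA9876543210ULL;
  unsigned c = 4;
  unsigned __int128 cat = (static_cast<unsigned __int128>(x) << 64) | y;
  assert(shldModel(x, y, c) == static_cast<uint64_t>(cat >> (64 - c)));
  return 0;
}
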
24568
24569// Generate NEG and CMOV for integer abs.
24570static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24571 EVT VT = N->getValueType(0);
24572
24573 // Since X86 does not have CMOV for 8-bit integer, we don't convert
24574 // 8-bit integer abs to NEG and CMOV.
24575 if (VT.isInteger() && VT.getSizeInBits() == 8)
24576 return SDValue();
24577
24578 SDValue N0 = N->getOperand(0);
24579 SDValue N1 = N->getOperand(1);
24580 SDLoc DL(N);
24581
24582 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24583 // and change it to SUB and CMOV.
24584 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24585 N0.getOpcode() == ISD::ADD &&
24586 N0.getOperand(1) == N1 &&
24587 N1.getOpcode() == ISD::SRA &&
24588 N1.getOperand(0) == N0.getOperand(0))
24589 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24590 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24591 // Generate SUB & CMOV.
24592 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24593 DAG.getConstant(0, VT), N0.getOperand(0));
24594
24595 SDValue Ops[] = { N0.getOperand(0), Neg,
24596 DAG.getConstant(X86::COND_GE, MVT::i8),
24597 SDValue(Neg.getNode(), 1) };
24598 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24599 }
24600 return SDValue();
24601}
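
// A standalone sketch (not LLVM code) of the branch-free abs pattern matched
// above: with Y = X >> 31 (arithmetic shift, all ones for negative X),
// (X + Y) ^ Y equals |X|. It relies on arithmetic right shift of negative
// values, which mainstream compilers provide and C++20 guarantees; INT_MIN is
// excluded since its negation overflows either way.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static int32_t absPattern(int32_t x) {
  int32_t y = x >> 31;          // all ones if x is negative, all zeros otherwise
  return (x + y) ^ y;
}

int main() {
  for (int32_t x : {0, 1, -1, 42, -42, 2147483647})
    assert(absPattern(x) == (x < 0 ? -x : x));
  return 0;
}
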
24602
24603 // PerformXorCombine - Do target-specific DAG combines on XOR nodes
24604 // (currently just the integer-abs pattern above).
24604static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24605 TargetLowering::DAGCombinerInfo &DCI,
24606 const X86Subtarget *Subtarget) {
24607 if (DCI.isBeforeLegalizeOps())
24608 return SDValue();
24609
24610 if (Subtarget->hasCMov()) {
24611 SDValue RV = performIntegerAbsCombine(N, DAG);
24612 if (RV.getNode())
24613 return RV;
24614 }
24615
24616 return SDValue();
24617}
24618
24619/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24620static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24621 TargetLowering::DAGCombinerInfo &DCI,
24622 const X86Subtarget *Subtarget) {
24623 LoadSDNode *Ld = cast<LoadSDNode>(N);
24624 EVT RegVT = Ld->getValueType(0);
24625 EVT MemVT = Ld->getMemoryVT();
24626 SDLoc dl(Ld);
24627 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24628
24629 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24630 // into two 16-byte operations.
24631 ISD::LoadExtType Ext = Ld->getExtensionType();
24632 unsigned Alignment = Ld->getAlignment();
24633 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24634 if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24635 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24636 unsigned NumElems = RegVT.getVectorNumElements();
24637 if (NumElems < 2)
24638 return SDValue();
24639
24640 SDValue Ptr = Ld->getBasePtr();
24641 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24642
24643 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24644 NumElems/2);
24645 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24646 Ld->getPointerInfo(), Ld->isVolatile(),
24647 Ld->isNonTemporal(), Ld->isInvariant(),
24648 Alignment);
24649 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24650 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24651 Ld->getPointerInfo(), Ld->isVolatile(),
24652 Ld->isNonTemporal(), Ld->isInvariant(),
24653 std::min(16U, Alignment));
24654 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24655 Load1.getValue(1),
24656 Load2.getValue(1));
24657
24658 SDValue NewVec = DAG.getUNDEF(RegVT);
24659 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24660 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24661 return DCI.CombineTo(N, NewVec, TF, true);
24662 }
24663
24664 return SDValue();
24665}
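
// Illustrative sketch only (plain memcpy, not LLVM code): the shape of the
// split performed by PerformLOADCombine above. A 32-byte unaligned load is
// replaced by two 16-byte loads at offsets 0 and 16, whose results are
// re-inserted into the 256-bit value.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t src[32], dst[32];
  for (int i = 0; i < 32; ++i) src[i] = static_cast<uint8_t>(i);
  std::memcpy(dst, src, 16);            // Load1 at the base pointer
  std::memcpy(dst + 16, src + 16, 16);  // Load2 at base pointer + 16
  assert(std::memcmp(dst, src, 32) == 0);
  return 0;
}
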
24666
24667/// PerformMLOADCombine - Resolve extending loads
24668static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24669 TargetLowering::DAGCombinerInfo &DCI,
24670 const X86Subtarget *Subtarget) {
24671 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24672 if (Mld->getExtensionType() != ISD::SEXTLOAD)
24673 return SDValue();
24674
24675 EVT VT = Mld->getValueType(0);
24676 unsigned NumElems = VT.getVectorNumElements();
24677 EVT LdVT = Mld->getMemoryVT();
24678 SDLoc dl(Mld);
24679
24680 assert(LdVT != VT && "Cannot extend to the same type");
24681 unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24682 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24683 // From, To sizes and ElemCount must be pow of two
24684 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24685         "Unexpected size for extending masked load");
24686
24687 unsigned SizeRatio = ToSz / FromSz;
24688 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24689
24690 // Create a type on which we perform the shuffle
24691 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24692 LdVT.getScalarType(), NumElems*SizeRatio);
24693 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24694
24695 // Convert Src0 value
24696 SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24697 if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24698 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24699 for (unsigned i = 0; i != NumElems; ++i)
24700 ShuffleVec[i] = i * SizeRatio;
24701
24702 // Can't shuffle using an illegal type.
24703 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24704         && "WideVecVT should be legal");
24705 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24706 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24707 }
24708 // Prepare the new mask
24709 SDValue NewMask;
24710 SDValue Mask = Mld->getMask();
24711 if (Mask.getValueType() == VT) {
24712 // Mask and original value have the same type
24713 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24714 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24715 for (unsigned i = 0; i != NumElems; ++i)
24716 ShuffleVec[i] = i * SizeRatio;
24717 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24718 ShuffleVec[i] = NumElems*SizeRatio;
24719 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24720 DAG.getConstant(0, WideVecVT),
24721 &ShuffleVec[0]);
24722 }
24723 else {
24724 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24725 unsigned WidenNumElts = NumElems*SizeRatio;
24726 unsigned MaskNumElts = VT.getVectorNumElements();
24727 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24728 WidenNumElts);
24729
24730 unsigned NumConcat = WidenNumElts / MaskNumElts;
24731 SmallVector<SDValue, 16> Ops(NumConcat);
24732 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24733 Ops[0] = Mask;
24734 for (unsigned i = 1; i != NumConcat; ++i)
24735 Ops[i] = ZeroVal;
24736
24737 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24738 }
24739
24740 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24741 Mld->getBasePtr(), NewMask, WideSrc0,
24742 Mld->getMemoryVT(), Mld->getMemOperand(),
24743 ISD::NON_EXTLOAD);
24744 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24745 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24746
24747}
24748/// PerformMSTORECombine - Resolve truncating stores
24749static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24750 const X86Subtarget *Subtarget) {
24751 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24752 if (!Mst->isTruncatingStore())
24753 return SDValue();
24754
24755 EVT VT = Mst->getValue().getValueType();
24756 unsigned NumElems = VT.getVectorNumElements();
24757 EVT StVT = Mst->getMemoryVT();
24758 SDLoc dl(Mst);
24759
24760 assert(StVT != VT && "Cannot truncate to the same type");
24761 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24762 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24763
24764 // From, To sizes and ElemCount must be pow of two
24765 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24766         "Unexpected size for truncating masked store");
24767 // We are going to use the original vector elt for storing.
24768 // Accumulated smaller vector elements must be a multiple of the store size.
24769 assert (((NumElems * FromSz) % ToSz) == 0 &&
24770         "Unexpected ratio for truncating masked store");
24771
24772 unsigned SizeRatio = FromSz / ToSz;
24773 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24774
24775 // Create a type on which we perform the shuffle
24776 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24777 StVT.getScalarType(), NumElems*SizeRatio);
24778
24779 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24780
24781 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
24782 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24783 for (unsigned i = 0; i != NumElems; ++i)
24784 ShuffleVec[i] = i * SizeRatio;
24785
24786 // Can't shuffle using an illegal type.
24787 assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24788         && "WideVecVT should be legal");
24789
24790 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24791 DAG.getUNDEF(WideVecVT),
24792 &ShuffleVec[0]);
24793
24794 SDValue NewMask;
24795 SDValue Mask = Mst->getMask();
24796 if (Mask.getValueType() == VT) {
24797 // Mask and original value have the same type
24798 NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24799 for (unsigned i = 0; i != NumElems; ++i)
24800 ShuffleVec[i] = i * SizeRatio;
24801 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24802 ShuffleVec[i] = NumElems*SizeRatio;
24803 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24804 DAG.getConstant(0, WideVecVT),
24805 &ShuffleVec[0]);
24806 }
24807 else {
24808 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24809 unsigned WidenNumElts = NumElems*SizeRatio;
24810 unsigned MaskNumElts = VT.getVectorNumElements();
24811 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
24812 WidenNumElts);
24813
24814 unsigned NumConcat = WidenNumElts / MaskNumElts;
24815 SmallVector<SDValue, 16> Ops(NumConcat);
24816 SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24817 Ops[0] = Mask;
24818 for (unsigned i = 1; i != NumConcat; ++i)
24819 Ops[i] = ZeroVal;
24820
24821 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24822 }
24823
24824 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
24825 NewMask, StVT, Mst->getMemOperand(), false);
24826}
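
// Illustrative sketch, not part of X86ISelLowering.cpp: the shuffle mask built
// by the masked load/store combines above. For NumElems = 4 and SizeRatio = 2
// each narrow element is placed at the start of its wide slot and the remaining
// lanes stay undef (-1), giving {0, 2, 4, 6, -1, -1, -1, -1}.
#include <cassert>
#include <vector>

int main() {
  unsigned NumElems = 4, SizeRatio = 2;
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);  // -1 marks an undef lane
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;                        // narrow element -> start of wide slot
  assert((ShuffleVec == std::vector<int>{0, 2, 4, 6, -1, -1, -1, -1}));
  return 0;
}
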
24827/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24828static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24829 const X86Subtarget *Subtarget) {
24830 StoreSDNode *St = cast<StoreSDNode>(N);
24831 EVT VT = St->getValue().getValueType();
24832 EVT StVT = St->getMemoryVT();
24833 SDLoc dl(St);
24834 SDValue StoredVal = St->getOperand(1);
24835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24836
24837 // If we are saving a concatenation of two XMM registers and 32-byte stores
24838 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24839 unsigned Alignment = St->getAlignment();
24840 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24841 if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24842 StVT == VT && !IsAligned) {
24843 unsigned NumElems = VT.getVectorNumElements();
24844 if (NumElems < 2)
24845 return SDValue();
24846
24847 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24848 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24849
24850 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24851 SDValue Ptr0 = St->getBasePtr();
24852 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24853
24854 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24855 St->getPointerInfo(), St->isVolatile(),
24856 St->isNonTemporal(), Alignment);
24857 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24858 St->getPointerInfo(), St->isVolatile(),
24859 St->isNonTemporal(),
24860 std::min(16U, Alignment));
24861 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24862 }
24863
24864 // Optimize trunc store (of multiple scalars) to shuffle and store.
24865 // First, pack all of the elements in one place. Next, store to memory
24866 // in fewer chunks.
24867 if (St->isTruncatingStore() && VT.isVector()) {
24868 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24869 unsigned NumElems = VT.getVectorNumElements();
24870 assert(StVT != VT && "Cannot truncate to the same type");
24871 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24872 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24873
24874 // From, To sizes and ElemCount must be pow of two
24875 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24876 // We are going to use the original vector elt for storing.
24877 // Accumulated smaller vector elements must be a multiple of the store size.
24878 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24879
24880 unsigned SizeRatio = FromSz / ToSz;
24881
24882 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24883
24884 // Create a type on which we perform the shuffle
24885 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24886 StVT.getScalarType(), NumElems*SizeRatio);
24887
24888 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24889
24890 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24891 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24892 for (unsigned i = 0; i != NumElems; ++i)
24893 ShuffleVec[i] = i * SizeRatio;
24894
24895 // Can't shuffle using an illegal type.
24896 if (!TLI.isTypeLegal(WideVecVT))
24897 return SDValue();
24898
24899 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24900 DAG.getUNDEF(WideVecVT),
24901 &ShuffleVec[0]);
24902 // At this point all of the data is stored at the bottom of the
24903 // register. We now need to save it to mem.
24904
24905 // Find the largest store unit
24906 MVT StoreType = MVT::i8;
24907 for (MVT Tp : MVT::integer_valuetypes()) {
24908 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24909 StoreType = Tp;
24910 }
24911
24912 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
24913 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24914 (64 <= NumElems * ToSz))
24915 StoreType = MVT::f64;
24916
24917 // Bitcast the original vector into a vector of store-size units
24918 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24919 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24920 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24921 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24922 SmallVector<SDValue, 8> Chains;
24923 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24924 TLI.getPointerTy());
24925 SDValue Ptr = St->getBasePtr();
24926
24927 // Perform one or more big stores into memory.
24928 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24929 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24930 StoreType, ShuffWide,
24931 DAG.getIntPtrConstant(i));
24932 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24933 St->getPointerInfo(), St->isVolatile(),
24934 St->isNonTemporal(), St->getAlignment());
24935 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24936 Chains.push_back(Ch);
24937 }
24938
24939 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24940 }
24941
24942 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
24943 // the FP state in cases where an emms may be missing.
24944 // A preferable solution to the general problem is to figure out the right
24945 // places to insert EMMS. This qualifies as a quick hack.
24946
24947 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
24948 if (VT.getSizeInBits() != 64)
24949 return SDValue();
24950
24951 const Function *F = DAG.getMachineFunction().getFunction();
24952 bool NoImplicitFloatOps = F->getAttributes().
24953 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
24954 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
24955 && Subtarget->hasSSE2();
24956 if ((VT.isVector() ||
24957 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
24958 isa<LoadSDNode>(St->getValue()) &&
24959 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
24960 St->getChain().hasOneUse() && !St->isVolatile()) {
24961 SDNode* LdVal = St->getValue().getNode();
24962 LoadSDNode *Ld = nullptr;
24963 int TokenFactorIndex = -1;
24964 SmallVector<SDValue, 8> Ops;
24965 SDNode* ChainVal = St->getChain().getNode();
24966 // Must be a store of a load. We currently handle two cases: the load
24967 // is a direct child, and it's under an intervening TokenFactor. It is
24968 // possible to dig deeper under nested TokenFactors.
24969 if (ChainVal == LdVal)
24970 Ld = cast<LoadSDNode>(St->getChain());
24971 else if (St->getValue().hasOneUse() &&
24972 ChainVal->getOpcode() == ISD::TokenFactor) {
24973 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
24974 if (ChainVal->getOperand(i).getNode() == LdVal) {
24975 TokenFactorIndex = i;
24976 Ld = cast<LoadSDNode>(St->getValue());
24977 } else
24978 Ops.push_back(ChainVal->getOperand(i));
24979 }
24980 }
24981
24982 if (!Ld || !ISD::isNormalLoad(Ld))
24983 return SDValue();
24984
24985 // If this is not the MMX case, i.e. we are just turning i64 load/store
24986 // into f64 load/store, avoid the transformation if there are multiple
24987 // uses of the loaded value.
24988 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
24989 return SDValue();
24990
24991 SDLoc LdDL(Ld);
24992 SDLoc StDL(N);
24993 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
24994 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
24995 // pair instead.
24996 if (Subtarget->is64Bit() || F64IsLegal) {
24997 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
24998 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
24999 Ld->getPointerInfo(), Ld->isVolatile(),
25000 Ld->isNonTemporal(), Ld->isInvariant(),
25001 Ld->getAlignment());
25002 SDValue NewChain = NewLd.getValue(1);
25003 if (TokenFactorIndex != -1) {
25004 Ops.push_back(NewChain);
25005 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25006 }
25007 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25008 St->getPointerInfo(),
25009 St->isVolatile(), St->isNonTemporal(),
25010 St->getAlignment());
25011 }
25012
25013 // Otherwise, lower to two pairs of 32-bit loads / stores.
25014 SDValue LoAddr = Ld->getBasePtr();
25015 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25016 DAG.getConstant(4, MVT::i32));
25017
25018 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25019 Ld->getPointerInfo(),
25020 Ld->isVolatile(), Ld->isNonTemporal(),
25021 Ld->isInvariant(), Ld->getAlignment());
25022 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25023 Ld->getPointerInfo().getWithOffset(4),
25024 Ld->isVolatile(), Ld->isNonTemporal(),
25025 Ld->isInvariant(),
25026 MinAlign(Ld->getAlignment(), 4));
25027
25028 SDValue NewChain = LoLd.getValue(1);
25029 if (TokenFactorIndex != -1) {
25030 Ops.push_back(LoLd);
25031 Ops.push_back(HiLd);
25032 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25033 }
25034
25035 LoAddr = St->getBasePtr();
25036 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25037 DAG.getConstant(4, MVT::i32));
25038
25039 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25040 St->getPointerInfo(),
25041 St->isVolatile(), St->isNonTemporal(),
25042 St->getAlignment());
25043 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25044 St->getPointerInfo().getWithOffset(4),
25045 St->isVolatile(),
25046 St->isNonTemporal(),
25047 MinAlign(St->getAlignment(), 4));
25048 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25049 }
25050 return SDValue();
25051}
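As an aside, the pack mask built in the truncating-store path above can be illustrated with a minimal standalone sketch (plain C++, assuming a hypothetical v8i32 -> v8i16 truncating store, so SizeRatio is 2):

// A minimal sketch of the pack-mask computation used by the truncating-store
// path above, for an assumed v8i32 -> v8i16 truncating store (SizeRatio = 2).
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 8;               // elements in the source vector
  const unsigned FromSz = 32, ToSz = 16;     // element sizes in bits
  const unsigned SizeRatio = FromSz / ToSz;  // 2: each i32 covers two i16 slots

  // After bitcasting v8i32 to v16i16, the low half of source element i lives
  // at wide index i * SizeRatio (little endian); the mask gathers those lanes
  // into positions 0..NumElems-1 and leaves the rest undef (-1).
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  for (int M : ShuffleVec)
    std::printf("%d ", M);
  std::printf("\n");  // prints: 0 2 4 6 8 10 12 14 -1 -1 -1 -1 -1 -1 -1 -1
}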
25052
25053/// Return 'true' if this vector operation is "horizontal"
25054/// and return the operands for the horizontal operation in LHS and RHS. A
25055/// horizontal operation performs the binary operation on successive elements
25056/// of its first operand, then on successive elements of its second operand,
25057/// returning the resulting values in a vector. For example, if
25058/// A = < float a0, float a1, float a2, float a3 >
25059/// and
25060/// B = < float b0, float b1, float b2, float b3 >
25061/// then the result of doing a horizontal operation on A and B is
25062/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25063/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25064/// A horizontal-op B, for some already available A and B, and if so then LHS is
25065/// set to A, RHS to B, and the routine returns 'true'.
25066/// Note that the binary operation should have the property that if one of the
25067/// operands is UNDEF then the result is UNDEF.
25068static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25069 // Look for the following pattern: if
25070 // A = < float a0, float a1, float a2, float a3 >
25071 // B = < float b0, float b1, float b2, float b3 >
25072 // and
25073 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25074 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25075 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25076 // which is A horizontal-op B.
25077
25078 // At least one of the operands should be a vector shuffle.
25079 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25080 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25081 return false;
25082
25083 MVT VT = LHS.getSimpleValueType();
25084
25085 assert((VT.is128BitVector() || VT.is256BitVector()) &&
25086        "Unsupported vector type for horizontal add/sub");
25087
25088 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25089 // operate independently on 128-bit lanes.
25090 unsigned NumElts = VT.getVectorNumElements();
25091 unsigned NumLanes = VT.getSizeInBits()/128;
25092 unsigned NumLaneElts = NumElts / NumLanes;
25093 assert((NumLaneElts % 2 == 0) &&
25094        "Vector type should have an even number of elements in each lane");
25095 unsigned HalfLaneElts = NumLaneElts/2;
25096
25097 // View LHS in the form
25098 // LHS = VECTOR_SHUFFLE A, B, LMask
25099 // If LHS is not a shuffle then pretend it is the shuffle
25100 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25101 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25102 // type VT.
25103 SDValue A, B;
25104 SmallVector<int, 16> LMask(NumElts);
25105 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25106 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25107 A = LHS.getOperand(0);
25108 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25109 B = LHS.getOperand(1);
25110 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25111 std::copy(Mask.begin(), Mask.end(), LMask.begin());
25112 } else {
25113 if (LHS.getOpcode() != ISD::UNDEF)
25114 A = LHS;
25115 for (unsigned i = 0; i != NumElts; ++i)
25116 LMask[i] = i;
25117 }
25118
25119 // Likewise, view RHS in the form
25120 // RHS = VECTOR_SHUFFLE C, D, RMask
25121 SDValue C, D;
25122 SmallVector<int, 16> RMask(NumElts);
25123 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25124 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25125 C = RHS.getOperand(0);
25126 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25127 D = RHS.getOperand(1);
25128 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25129 std::copy(Mask.begin(), Mask.end(), RMask.begin());
25130 } else {
25131 if (RHS.getOpcode() != ISD::UNDEF)
25132 C = RHS;
25133 for (unsigned i = 0; i != NumElts; ++i)
25134 RMask[i] = i;
25135 }
25136
25137 // Check that the shuffles are both shuffling the same vectors.
25138 if (!(A == C && B == D) && !(A == D && B == C))
25139 return false;
25140
25141 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25142 if (!A.getNode() && !B.getNode())
25143 return false;
25144
25145 // If A and B occur in reverse order in RHS, then "swap" them (which means
25146 // rewriting the mask).
25147 if (A != C)
25148 CommuteVectorShuffleMask(RMask, NumElts);
25149
25150 // At this point LHS and RHS are equivalent to
25151 // LHS = VECTOR_SHUFFLE A, B, LMask
25152 // RHS = VECTOR_SHUFFLE A, B, RMask
25153 // Check that the masks correspond to performing a horizontal operation.
25154 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25155 for (unsigned i = 0; i != NumLaneElts; ++i) {
25156 int LIdx = LMask[i+l], RIdx = RMask[i+l];
25157
25158 // Ignore any UNDEF components.
25159 if (LIdx < 0 || RIdx < 0 ||
25160 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25161 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25162 continue;
25163
25164 // Check that successive elements are being operated on. If not, this is
25165 // not a horizontal operation.
25166 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25167 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25168 if (!(LIdx == Index && RIdx == Index + 1) &&
25169 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25170 return false;
25171 }
25172 }
25173
25174 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25175 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25176 return true;
25177}
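For reference, the element pairing that isHorizontalBinOp checks for can be modelled with a short standalone scalar sketch (plain C++, one 128-bit lane of v4f32 assumed): the low half of each result lane pairs successive elements of the first source, the high half pairs successive elements of the second.

// Standalone scalar model of the horizontal-op layout recognized above:
// result = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > per 128-bit lane.
#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40}, R;
  const unsigned NumLaneElts = 4, HalfLaneElts = NumLaneElts / 2;
  for (unsigned i = 0; i != NumLaneElts; ++i) {
    const auto &Src = (i < HalfLaneElts) ? A : B;   // each lane is split between srcs
    unsigned Base = 2 * (i % HalfLaneElts);         // successive elements are combined
    R[i] = Src[Base] + Src[Base + 1];
  }
  for (float V : R)
    std::printf("%g ", V);   // prints: 3 7 30 70
  std::printf("\n");
}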
25178
25179/// Do target-specific dag combines on floating point adds.
25180static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25181 const X86Subtarget *Subtarget) {
25182 EVT VT = N->getValueType(0);
25183 SDValue LHS = N->getOperand(0);
25184 SDValue RHS = N->getOperand(1);
25185
25186 // Try to synthesize horizontal adds from adds of shuffles.
25187 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25188 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25189 isHorizontalBinOp(LHS, RHS, true))
25190 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25191 return SDValue();
25192}
25193
25194/// Do target-specific dag combines on floating point subs.
25195static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25196 const X86Subtarget *Subtarget) {
25197 EVT VT = N->getValueType(0);
25198 SDValue LHS = N->getOperand(0);
25199 SDValue RHS = N->getOperand(1);
25200
25201 // Try to synthesize horizontal subs from subs of shuffles.
25202 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25203 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25204 isHorizontalBinOp(LHS, RHS, false))
25205 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25206 return SDValue();
25207}
25208
25209/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25210static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25211 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25212 // F[X]OR(0.0, x) -> x
25213 // F[X]OR(x, 0.0) -> x
25214 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25215 if (C->getValueAPF().isPosZero())
25216 return N->getOperand(1);
25217 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25218 if (C->getValueAPF().isPosZero())
25219 return N->getOperand(0);
25220 return SDValue();
25221}
25222
25223/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25224static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25225 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25226
25227 // Only perform optimizations if UnsafeMath is used.
25228 if (!DAG.getTarget().Options.UnsafeFPMath)
25229 return SDValue();
25230
25231 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25232 // into FMINC and FMAXC, which are Commutative operations.
25233 unsigned NewOp = 0;
25234 switch (N->getOpcode()) {
25235 default: llvm_unreachable("unknown opcode");
25236 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
25237 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
25238 }
25239
25240 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25241 N->getOperand(0), N->getOperand(1));
25242}
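A brief standalone illustration (plain C++, modelling minss, where the second source operand is returned whenever the comparison is unordered) of why the rewrite to the commutative FMINC is gated on unsafe-math:

// Scalar model of x86 minss: result = a < b ? a : b, so the second operand is
// returned when a NaN is involved. FMIN is therefore not commutative in
// general, and only UnsafeFPMath permits switching to the commutative FMINC.
#include <cmath>
#include <cstdio>

static float x86_min(float a, float b) { return a < b ? a : b; }

int main() {
  float nan = std::nanf("");
  std::printf("%g %g\n", x86_min(nan, 1.0f), x86_min(1.0f, nan));
  // prints: 1 nan  -- swapping the operands changes the result
}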
25243
25244/// Do target-specific dag combines on X86ISD::FAND nodes.
25245static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25246 // FAND(0.0, x) -> 0.0
25247 // FAND(x, 0.0) -> 0.0
25248 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25249 if (C->getValueAPF().isPosZero())
25250 return N->getOperand(0);
25251 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25252 if (C->getValueAPF().isPosZero())
25253 return N->getOperand(1);
25254 return SDValue();
25255}
25256
25257/// Do target-specific dag combines on X86ISD::FANDN nodes
25258static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25259 // FANDN(x, 0.0) -> 0.0
25260 // FANDN(0.0, x) -> x
25261 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25262 if (C->getValueAPF().isPosZero())
25263 return N->getOperand(1);
25264 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25265 if (C->getValueAPF().isPosZero())
25266 return N->getOperand(1);
25267 return SDValue();
25268}
25269
25270static SDValue PerformBTCombine(SDNode *N,
25271 SelectionDAG &DAG,
25272 TargetLowering::DAGCombinerInfo &DCI) {
25273 // BT ignores high bits in the bit index operand.
25274 SDValue Op1 = N->getOperand(1);
25275 if (Op1.hasOneUse()) {
25276 unsigned BitWidth = Op1.getValueSizeInBits();
25277 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25278 APInt KnownZero, KnownOne;
25279 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25280 !DCI.isBeforeLegalizeOps());
25281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25282 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25283 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25284 DCI.CommitTargetLoweringOpt(TLO);
25285 }
25286 return SDValue();
25287}
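As a small aside, the fact this combine relies on can be sketched in standalone form (plain C++, register form of BT assumed): the bit-test index is taken modulo the operand width, so only the low Log2(BitWidth) bits of the index are demanded.

// Scalar model of BT with a 32-bit register operand: the bit index is
// interpreted modulo 32, so bits of Op1 above Log2_32(BitWidth) are dead,
// which is what lets SimplifyDemandedBits shrink the index operand above.
#include <cassert>
#include <cstdint>

static bool bt32(uint32_t Src, uint32_t BitIdx) {
  return (Src >> (BitIdx & 31)) & 1u;   // only the low 5 bits of BitIdx matter
}

int main() {
  uint32_t V = 0xAu;                       // bits 1 and 3 set
  assert(bt32(V, 3) == bt32(V, 3 + 32));   // high bits of the index are ignored
  assert(bt32(V, 1) && !bt32(V, 2));
}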
25288
25289static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25290 SDValue Op = N->getOperand(0);
25291 if (Op.getOpcode() == ISD::BITCAST)
25292 Op = Op.getOperand(0);
25293 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25294 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25295 VT.getVectorElementType().getSizeInBits() ==
25296 OpVT.getVectorElementType().getSizeInBits()) {
25297 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25298 }
25299 return SDValue();
25300}
25301
25302static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25303 const X86Subtarget *Subtarget) {
25304 EVT VT = N->getValueType(0);
25305 if (!VT.isVector())
25306 return SDValue();
25307
25308 SDValue N0 = N->getOperand(0);
25309 SDValue N1 = N->getOperand(1);
25310 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25311 SDLoc dl(N);
25312
25313 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
25314 // SSE and AVX2 since there is no sign-extended shift right
25315 // operation on a vector with 64-bit elements.
25316 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25317 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25318 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25319 N0.getOpcode() == ISD::SIGN_EXTEND)) {
25320 SDValue N00 = N0.getOperand(0);
25321
25322 // EXTLOAD has a better solution on AVX2:
25323 // it may be replaced with an X86ISD::VSEXT node.
25324 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25325 if (!ISD::isNormalLoad(N00.getNode()))
25326 return SDValue();
25327
25328 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25329 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25330 N00, N1);
25331 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25332 }
25333 }
25334 return SDValue();
25335}
25336
25337static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25338 TargetLowering::DAGCombinerInfo &DCI,
25339 const X86Subtarget *Subtarget) {
25340 SDValue N0 = N->getOperand(0);
25341 EVT VT = N->getValueType(0);
25342
25343 // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25344 // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25345 // This exposes the sext to the sdivrem lowering, so that it directly extends
25346 // from AH (which we otherwise need to do contortions to access).
25347 if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25348 N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25349 SDLoc dl(N);
25350 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25351 SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25352 N0.getOperand(0), N0.getOperand(1));
25353 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25354 return R.getValue(1);
25355 }
25356
25357 if (!DCI.isBeforeLegalizeOps())
25358 return SDValue();
25359
25360 if (!Subtarget->hasFp256())
25361 return SDValue();
25362
25363 if (VT.isVector() && VT.getSizeInBits() == 256) {
25364 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25365 if (R.getNode())
25366 return R;
25367 }
25368
25369 return SDValue();
25370}
25371
25372static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25373 const X86Subtarget* Subtarget) {
25374 SDLoc dl(N);
25375 EVT VT = N->getValueType(0);
25376
25377 // Let legalize expand this if it isn't a legal type yet.
25378 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25379 return SDValue();
25380
25381 EVT ScalarVT = VT.getScalarType();
25382 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25383 (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25384 return SDValue();
25385
25386 SDValue A = N->getOperand(0);
25387 SDValue B = N->getOperand(1);
25388 SDValue C = N->getOperand(2);
25389
25390 bool NegA = (A.getOpcode() == ISD::FNEG);
25391 bool NegB = (B.getOpcode() == ISD::FNEG);
25392 bool NegC = (C.getOpcode() == ISD::FNEG);
25393
25394 // Negative multiplication when NegA xor NegB
25395 bool NegMul = (NegA != NegB);
25396 if (NegA)
25397 A = A.getOperand(0);
25398 if (NegB)
25399 B = B.getOperand(0);
25400 if (NegC)
25401 C = C.getOperand(0);
25402
25403 unsigned Opcode;
25404 if (!NegMul)
25405 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25406 else
25407 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25408
25409 return DAG.getNode(Opcode, dl, VT, A, B, C);
25410}
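For clarity, the opcode selection above can be checked with a scalar sketch (plain C++ doubles, hypothetical helper names standing in for the FMADD/FMSUB/FNMADD/FNMSUB semantics): negating either multiplicand flips to the negated-multiply forms, and negating the addend flips add to sub.

// Scalar reference for the FMA opcode selection above (a*b + c and friends).
#include <cassert>

static double fmadd (double a, double b, double c) { return  a * b + c; }
static double fmsub (double a, double b, double c) { return  a * b - c; }
static double fnmadd(double a, double b, double c) { return -a * b + c; }
static double fnmsub(double a, double b, double c) { return -a * b - c; }

int main() {
  double a = 2, b = 3, c = 5;
  // fma(-a, b, c): NegMul, addend not negated -> FNMADD.
  assert(fmadd(-a, b, c) == fnmadd(a, b, c));
  // fma(a, -b, -c): NegMul and NegC -> FNMSUB.
  assert(fmadd(a, -b, -c) == fnmsub(a, b, c));
  // fma(a, b, -c): only the addend negated -> FMSUB.
  assert(fmadd(a, b, -c) == fmsub(a, b, c));
}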
25411
25412static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25413 TargetLowering::DAGCombinerInfo &DCI,
25414 const X86Subtarget *Subtarget) {
25415 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
25416 // (and (i32 x86isd::setcc_carry), 1)
25417 // This eliminates the zext. This transformation is necessary because
25418 // ISD::SETCC is always legalized to i8.
25419 SDLoc dl(N);
25420 SDValue N0 = N->getOperand(0);
25421 EVT VT = N->getValueType(0);
25422
25423 if (N0.getOpcode() == ISD::AND &&
25424 N0.hasOneUse() &&
25425 N0.getOperand(0).hasOneUse()) {
25426 SDValue N00 = N0.getOperand(0);
25427 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25428 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25429 if (!C || C->getZExtValue() != 1)
25430 return SDValue();
25431 return DAG.getNode(ISD::AND, dl, VT,
25432 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25433 N00.getOperand(0), N00.getOperand(1)),
25434 DAG.getConstant(1, VT));
25435 }
25436 }
25437
25438 if (N0.getOpcode() == ISD::TRUNCATE &&
25439 N0.hasOneUse() &&
25440 N0.getOperand(0).hasOneUse()) {
25441 SDValue N00 = N0.getOperand(0);
25442 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25443 return DAG.getNode(ISD::AND, dl, VT,
25444 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25445 N00.getOperand(0), N00.getOperand(1)),
25446 DAG.getConstant(1, VT));
25447 }
25448 }
25449 if (VT.is256BitVector()) {
25450 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25451 if (R.getNode())
25452 return R;
25453 }
25454
25455 // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25456 // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25457 // This exposes the zext to the udivrem lowering, so that it directly extends
25458 // from AH (which we otherwise need to do contortions to access).
25459 if (N0.getOpcode() == ISD::UDIVREM &&
25460 N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25461 (VT == MVT::i32 || VT == MVT::i64)) {
25462 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25463 SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25464 N0.getOperand(0), N0.getOperand(1));
25465 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25466 return R.getValue(1);
25467 }
25468
25469 return SDValue();
25470}
25471
25472// Optimize x == -y --> x+y == 0
25473// x != -y --> x+y != 0
25474static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25475 const X86Subtarget* Subtarget) {
25476 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25477 SDValue LHS = N->getOperand(0);
25478 SDValue RHS = N->getOperand(1);
25479 EVT VT = N->getValueType(0);
25480 SDLoc DL(N);
25481
25482 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25483 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25484 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25485 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25486 LHS.getValueType(), RHS, LHS.getOperand(1));
25487 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25488 addV, DAG.getConstant(0, addV.getValueType()), CC);
25489 }
25490 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25491 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25492 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25493 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25494 RHS.getValueType(), LHS, RHS.getOperand(1));
25495 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25496 addV, DAG.getConstant(0, addV.getValueType()), CC);
25497 }
25498
25499 if (VT.getScalarType() == MVT::i1) {
25500 bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25501 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25502 bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25503 if (!IsSEXT0 && !IsVZero0)
25504 return SDValue();
25505 bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25506 (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25507 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25508
25509 if (!IsSEXT1 && !IsVZero1)
25510 return SDValue();
25511
25512 if (IsSEXT0 && IsVZero1) {
25513 assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25514 if (CC == ISD::SETEQ)
25515 return DAG.getNOT(DL, LHS.getOperand(0), VT);
25516 return LHS.getOperand(0);
25517 }
25518 if (IsSEXT1 && IsVZero0) {
25519 assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25520 if (CC == ISD::SETEQ)
25521 return DAG.getNOT(DL, RHS.getOperand(0), VT);
25522 return RHS.getOperand(0);
25523 }
25524 }
25525
25526 return SDValue();
25527}
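The identity behind the first rewrite above admits a quick standalone spot check (plain C++ on unsigned 32-bit values, which share the DAG's two's-complement wraparound):

// x == -y is equivalent to x + y == 0 in modular arithmetic; spot-check it.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t x : Samples)
    for (uint32_t y : Samples)
      assert((x == (0u - y)) == (x + y == 0u));
}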
25528
25529static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25530 const X86Subtarget *Subtarget) {
25531 SDLoc dl(N);
25532 MVT VT = N->getOperand(1)->getSimpleValueType(0);
25533 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25534        "X86insertps is only defined for v4x32");
25535
25536 SDValue Ld = N->getOperand(1);
25537 if (MayFoldLoad(Ld)) {
25538 // Extract the countS bits from the immediate so we can get the proper
25539 // address when narrowing the vector load to a specific element.
25540 // When the second source op is a memory address, insertps doesn't use
25541 // countS and just gets an f32 from that address.
25542 unsigned DestIndex =
25543 cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25544 Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25545 } else
25546 return SDValue();
25547
25548 // Create this as a scalar to vector to match the instruction pattern.
25549 SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25550 // countS bits are ignored when loading from memory on insertps, which
25551 // means we don't need to explicitly set them to 0.
25552 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25553 LoadScalarToVector, N->getOperand(2));
25554}
25555
25556 // Helper function of PerformSETCCCombine. It materializes "setb reg"
25557// as "sbb reg,reg", since it can be extended without zext and produces
25558// an all-ones bit which is more useful than 0/1 in some cases.
25559static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25560 MVT VT) {
25561 if (VT == MVT::i8)
25562 return DAG.getNode(ISD::AND, DL, VT,
25563 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25564 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25565 DAG.getConstant(1, VT));
25566 assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25567 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25568 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25569 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25570}
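A small standalone sketch (plain C++) of why the sbb form is preferred: subtracting the carry from zero yields 0 or all-ones, which can be used directly as a mask, whereas setb only gives 0 or 1.

// Scalar model of "sbb reg,reg" after a borrow-producing compare: reg - reg
// - CF is 0 - CF, i.e. 0 or 0xFFFFFFFF, already a full-width mask with no zext.
#include <cassert>
#include <cstdint>

static uint32_t setcc_carry(bool CF) { return 0u - static_cast<uint32_t>(CF); }

int main() {
  assert(setcc_carry(false) == 0u);
  assert(setcc_carry(true)  == 0xFFFFFFFFu);
  // The i8 path above then masks with 1 to recover the 0/1 value of setb:
  assert((setcc_carry(true) & 1u) == 1u);
}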
25571
25572// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25573static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25574 TargetLowering::DAGCombinerInfo &DCI,
25575 const X86Subtarget *Subtarget) {
25576 SDLoc DL(N);
25577 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25578 SDValue EFLAGS = N->getOperand(1);
25579
25580 if (CC == X86::COND_A) {
25581 // Try to convert COND_A into COND_B in an attempt to facilitate
25582 // materializing "setb reg".
25583 //
25584 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25585 // cannot take an immediate as its first operand.
25586 //
25587 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25588 EFLAGS.getValueType().isInteger() &&
25589 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25590 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25591 EFLAGS.getNode()->getVTList(),
25592 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25593 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25594 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25595 }
25596 }
25597
25598 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25599 // a zext and produces an all-ones bit which is more useful than 0/1 in some
25600 // cases.
25601 if (CC == X86::COND_B)
25602 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25603
25604 SDValue Flags;
25605
25606 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25607 if (Flags.getNode()) {
25608 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25609 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25610 }
25611
25612 return SDValue();
25613}
25614
25615// Optimize branch condition evaluation.
25616//
25617static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25618 TargetLowering::DAGCombinerInfo &DCI,
25619 const X86Subtarget *Subtarget) {
25620 SDLoc DL(N);
25621 SDValue Chain = N->getOperand(0);
25622 SDValue Dest = N->getOperand(1);
25623 SDValue EFLAGS = N->getOperand(3);
25624 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25625
25626 SDValue Flags;
25627
25628 Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25629 if (Flags.getNode()) {
25630 SDValue Cond = DAG.getConstant(CC, MVT::i8);
25631 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25632 Flags);
25633 }
25634
25635 return SDValue();
25636}
25637
25638static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25639 SelectionDAG &DAG) {
25640 // Take advantage of vector comparisons producing 0 or -1 in each lane to
25641 // optimize away operation when it's from a constant.
25642 //
25643 // The general transformation is:
25644 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25645 // AND(VECTOR_CMP(x,y), constant2)
25646 // constant2 = UNARYOP(constant)
25647
25648 // Early exit if this isn't a vector operation, the operand of the
25649 // unary operation isn't a bitwise AND, or if the sizes of the operations
25650 // aren't the same.
25651 EVT VT = N->getValueType(0);
25652 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25653 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25654 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25655 return SDValue();
25656
25657 // Now check that the other operand of the AND is a constant. We could
25658 // make the transformation for non-constant splats as well, but it's unclear
25659 // that would be a benefit as it would not eliminate any operations, just
25660 // perform one more step in scalar code before moving to the vector unit.
25661 if (BuildVectorSDNode *BV =
25662 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25663 // Bail out if the vector isn't a constant.
25664 if (!BV->isConstant())
25665 return SDValue();
25666
25667 // Everything checks out. Build up the new and improved node.
25668 SDLoc DL(N);
25669 EVT IntVT = BV->getValueType(0);
25670 // Create a new constant of the appropriate type for the transformed
25671 // DAG.
25672 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25673 // The AND node needs bitcasts to/from an integer vector type around it.
25674 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25675 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25676 N->getOperand(0)->getOperand(0), MaskConst);
25677 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25678 return Res;
25679 }
25680
25681 return SDValue();
25682}
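A per-lane scalar model of the rewrite described above (plain C++, assuming SINT_TO_FP as the unary op): because each compare lane is either all-zeros or all-ones, converting the constant once up front and AND-ing afterwards produces the same bits.

// One lane of UNARYOP(AND(VECTOR_CMP(x,y), K)) vs. AND(VECTOR_CMP(x,y), K2)
// with K2 = UNARYOP(K), modelled with a uint32_t mask and sint_to_fp.
#include <cassert>
#include <cstdint>
#include <cstring>

static float    bitsToFloat(uint32_t B) { float F;    std::memcpy(&F, &B, 4); return F; }
static uint32_t floatToBits(float F)    { uint32_t B; std::memcpy(&B, &F, 4); return B; }

int main() {
  const int32_t K  = 42;                         // constant lane of the AND
  const float   K2 = static_cast<float>(K);      // constant2 = UNARYOP(constant)
  for (uint32_t Mask : {0u, 0xFFFFFFFFu}) {      // vector compares yield 0 or -1
    float Before = static_cast<float>(static_cast<int32_t>(Mask & uint32_t(K)));
    float After  = bitsToFloat(Mask & floatToBits(K2));
    assert(floatToBits(Before) == floatToBits(After));
  }
}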
25683
25684static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25685 const X86TargetLowering *XTLI) {
25686 // First try to optimize away the conversion entirely when it's
25687 // conditionally from a constant. Vectors only.
25688 SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25689 if (Res != SDValue())
25690 return Res;
25691
25692 // Now move on to more general possibilities.
25693 SDValue Op0 = N->getOperand(0);
25694 EVT InVT = Op0->getValueType(0);
25695
25696 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25697 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25698 SDLoc dl(N);
25699 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25700 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25701 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25702 }
25703
25704 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25705 // a 32-bit target where SSE doesn't support i64->FP operations.
25706 if (Op0.getOpcode() == ISD::LOAD) {
25707 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25708 EVT VT = Ld->getValueType(0);
25709 if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25710 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25711 !XTLI->getSubtarget()->is64Bit() &&
25712 VT == MVT::i64) {
25713 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25714 Ld->getChain(), Op0, DAG);
25715 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25716 return FILDChain;
25717 }
25718 }
25719 return SDValue();
25720}
25721
25722// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25723static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25724 X86TargetLowering::DAGCombinerInfo &DCI) {
25725 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25726 // the result is either zero or one (depending on the input carry bit).
25727 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25728 if (X86::isZeroNode(N->getOperand(0)) &&
25729 X86::isZeroNode(N->getOperand(1)) &&
25730 // We don't have a good way to replace an EFLAGS use, so only do this when
25731 // dead right now.
25732 SDValue(N, 1).use_empty()) {
25733 SDLoc DL(N);
25734 EVT VT = N->getValueType(0);
25735 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25736 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25737 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25738 DAG.getConstant(X86::COND_B,MVT::i8),
25739 N->getOperand(2)),
25740 DAG.getConstant(1, VT));
25741 return DCI.CombineTo(N, Res1, CarryOut);
25742 }
25743
25744 return SDValue();
25745}
25746
25747// fold (add Y, (sete X, 0)) -> adc 0, Y
25748// (add Y, (setne X, 0)) -> sbb -1, Y
25749// (sub (sete X, 0), Y) -> sbb 0, Y
25750// (sub (setne X, 0), Y) -> adc -1, Y
25751static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25752 SDLoc DL(N);
25753
25754 // Look through ZExts.
25755 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25756 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25757 return SDValue();
25758
25759 SDValue SetCC = Ext.getOperand(0);
25760 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25761 return SDValue();
25762
25763 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25764 if (CC != X86::COND_E && CC != X86::COND_NE)
25765 return SDValue();
25766
25767 SDValue Cmp = SetCC.getOperand(1);
25768 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25769 !X86::isZeroNode(Cmp.getOperand(1)) ||
25770 !Cmp.getOperand(0).getValueType().isInteger())
25771 return SDValue();
25772
25773 SDValue CmpOp0 = Cmp.getOperand(0);
25774 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25775 DAG.getConstant(1, CmpOp0.getValueType()));
25776
25777 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25778 if (CC == X86::COND_NE)
25779 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25780 DL, OtherVal.getValueType(), OtherVal,
25781 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25782 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25783 DL, OtherVal.getValueType(), OtherVal,
25784 DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25785}
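For the first of the folds listed above, a standalone scalar check (plain C++): after cmp X,1 the carry flag is set exactly when X is 0 (unsigned X < 1), so adc 0, Y reproduces Y + (X == 0).

// fold (add Y, (sete X, 0)) -> adc 0, Y:
// cmp X, 1 sets CF iff X < 1 (unsigned), i.e. iff X == 0, and adc adds CF in.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0u, 1u, 2u, 0xFFFFFFFFu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples) {
      uint32_t CF  = (X < 1u) ? 1u : 0u;        // carry out of cmp X, 1
      uint32_t Adc = Y + 0u + CF;               // adc 0, Y
      assert(Adc == Y + (X == 0u ? 1u : 0u));   // add Y, (sete X, 0)
    }
}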
25786
25787/// PerformADDCombine - Do target-specific dag combines on integer adds.
25788static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25789 const X86Subtarget *Subtarget) {
25790 EVT VT = N->getValueType(0);
25791 SDValue Op0 = N->getOperand(0);
25792 SDValue Op1 = N->getOperand(1);
25793
25794 // Try to synthesize horizontal adds from adds of shuffles.
25795 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25796 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25797 isHorizontalBinOp(Op0, Op1, true))
25798 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25799
25800 return OptimizeConditionalInDecrement(N, DAG);
25801}
25802
25803static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25804 const X86Subtarget *Subtarget) {
25805 SDValue Op0 = N->getOperand(0);
25806 SDValue Op1 = N->getOperand(1);
25807
25808 // X86 can't encode an immediate LHS of a sub. See if we can push the
25809 // negation into a preceding instruction.
25810 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25811 // If the RHS of the sub is a XOR with one use and a constant, invert the
25812 // immediate. Then add one to the LHS of the sub so we can turn
25813 // X-Y -> X+~Y+1, saving one register.
25814 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25815 isa<ConstantSDNode>(Op1.getOperand(1))) {
25816 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25817 EVT VT = Op0.getValueType();
25818 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25819 Op1.getOperand(0),
25820 DAG.getConstant(~XorC, VT));
25821 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25822 DAG.getConstant(C->getAPIntValue()+1, VT));
25823 }
25824 }
25825
25826 // Try to synthesize horizontal subs from subs of shuffles.
25827 EVT VT = N->getValueType(0);
25828 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25829 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25830 isHorizontalBinOp(Op0, Op1, true))
25831 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25832
25833 return OptimizeConditionalInDecrement(N, DAG);
25834}
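The immediate-LHS rewrite above rests on a modular-arithmetic identity that a standalone check makes explicit (plain C++, unsigned wraparound, hypothetical constants): C - (Y ^ K) equals (Y ^ ~K) + (C + 1).

// X86 can't encode an immediate LHS for sub, so C - (Y ^ K) is rewritten as
// (Y ^ ~K) + (C + 1); a quick spot check of that identity.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 100u, K = 0x0F0Fu;              // hypothetical constants
  const uint32_t Samples[] = {0u, 1u, 0x1234u, 0xFFFFFFFFu};
  for (uint32_t Y : Samples)
    assert(C - (Y ^ K) == (Y ^ ~K) + (C + 1u));
}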
25835
25836 /// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT nodes.
25837static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25838 TargetLowering::DAGCombinerInfo &DCI,
25839 const X86Subtarget *Subtarget) {
25840 SDLoc DL(N);
25841 MVT VT = N->getSimpleValueType(0);
25842 SDValue Op = N->getOperand(0);
25843 MVT OpVT = Op.getSimpleValueType();
25844 MVT OpEltVT = OpVT.getVectorElementType();
25845 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25846
25847 // (vzext (bitcast (vzext (x)) -> (vzext x)
25848 SDValue V = Op;
25849 while (V.getOpcode() == ISD::BITCAST)
25850 V = V.getOperand(0);
25851
25852 if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25853 MVT InnerVT = V.getSimpleValueType();
25854 MVT InnerEltVT = InnerVT.getVectorElementType();
25855
25856 // If the element sizes match exactly, we can just do one larger vzext. This
25857 // is always an exact type match as vzext operates on integer types.
25858 if (OpEltVT == InnerEltVT) {
25859 assert(OpVT == InnerVT && "Types must match for vzext!");
25860 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25861 }
25862
25863 // The only other way we can combine them is if only a single element of the
25864 // inner vzext is used in the input to the outer vzext.
25865 if (InnerEltVT.getSizeInBits() < InputBits)
25866 return SDValue();
25867
25868 // In this case, the inner vzext is completely dead because we're going to
25869 // only look at bits inside of the low element. Just do the outer vzext on
25870 // a bitcast of the input to the inner.
25871 return DAG.getNode(X86ISD::VZEXT, DL, VT,
25872 DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25873 }
25874
25875 // Check if we can bypass extracting and re-inserting an element of an input
25876 // vector. Essentially:
25877 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25878 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25879 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25880 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25881 SDValue ExtractedV = V.getOperand(0);
25882 SDValue OrigV = ExtractedV.getOperand(0);
25883 if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25884 if (ExtractIdx->getZExtValue() == 0) {
25885 MVT OrigVT = OrigV.getSimpleValueType();
25886 // Extract a subvector if necessary...
25887 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25888 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25889 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25890 OrigVT.getVectorNumElements() / Ratio);
25891 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25892 DAG.getIntPtrConstant(0));
25893 }
25894 Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25895 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25896 }
25897 }
25898
25899 return SDValue();
25900}
25901
25902SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25903 DAGCombinerInfo &DCI) const {
25904 SelectionDAG &DAG = DCI.DAG;
25905 switch (N->getOpcode()) {
25906 default: break;
25907 case ISD::EXTRACT_VECTOR_ELT:
25908 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25909 case ISD::VSELECT:
25910 case ISD::SELECT:
25911 case X86ISD::SHRUNKBLEND:
25912 return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25913 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25914 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
25915 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
25916 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
25917 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
25918 case ISD::SHL:
25919 case ISD::SRA:
25920 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
25921 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
25922 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
25923 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
25924 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
25925 case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
25926 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
25927 case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
25928 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
25929 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
25930 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
25931 case X86ISD::FXOR:
25932 case X86ISD::FOR: return PerformFORCombine(N, DAG);
25933 case X86ISD::FMIN:
25934 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
25935 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
25936 case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
25937 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
25938 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
25939 case ISD::ANY_EXTEND:
25940 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
25941 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
25942 case ISD::SIGN_EXTEND_INREG:
25943 return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
25944 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
25945 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
25946 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
25947 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
25948 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
25949 case X86ISD::SHUFP: // Handle all target specific shuffles
25950 case X86ISD::PALIGNR:
25951 case X86ISD::UNPCKH:
25952 case X86ISD::UNPCKL:
25953 case X86ISD::MOVHLPS:
25954 case X86ISD::MOVLHPS:
25955 case X86ISD::PSHUFB:
25956 case X86ISD::PSHUFD:
25957 case X86ISD::PSHUFHW:
25958 case X86ISD::PSHUFLW:
25959 case X86ISD::MOVSS:
25960 case X86ISD::MOVSD:
25961 case X86ISD::VPERMILPI:
25962 case X86ISD::VPERM2X128:
25963 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
25964 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
25965 case ISD::INTRINSIC_WO_CHAIN:
25966 return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
25967 case X86ISD::INSERTPS: {
25968 if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
25969 return PerformINSERTPSCombine(N, DAG, Subtarget);
25970 break;
25971 }
25972 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
25973 }
25974
25975 return SDValue();
25976}
25977
25978/// isTypeDesirableForOp - Return true if the target has native support for
25979/// the specified value type and it is 'desirable' to use the type for the
25980/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
25981/// instruction encodings are longer and some i16 instructions are slow.
25982bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
25983 if (!isTypeLegal(VT))
25984 return false;
25985 if (VT != MVT::i16)
25986 return true;
25987
25988 switch (Opc) {
25989 default:
25990 return true;
25991 case ISD::LOAD:
25992 case ISD::SIGN_EXTEND:
25993 case ISD::ZERO_EXTEND:
25994 case ISD::ANY_EXTEND:
25995 case ISD::SHL:
25996 case ISD::SRL:
25997 case ISD::SUB:
25998 case ISD::ADD:
25999 case ISD::MUL:
26000 case ISD::AND:
26001 case ISD::OR:
26002 case ISD::XOR:
26003 return false;
26004 }
26005}
26006
26007 /// IsDesirableToPromoteOp - This method queries the target whether it is
26008/// beneficial for dag combiner to promote the specified node. If true, it
26009/// should return the desired promotion type by reference.
26010bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26011 EVT VT = Op.getValueType();
26012 if (VT != MVT::i16)
26013 return false;
26014
26015 bool Promote = false;
26016 bool Commute = false;
26017 switch (Op.getOpcode()) {
26018 default: break;
26019 case ISD::LOAD: {
26020 LoadSDNode *LD = cast<LoadSDNode>(Op);
26021 // If the non-extending load has a single use and it's not live out, then it
26022 // might be folded.
26023 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26024 Op.hasOneUse()*/) {
26025 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26026 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26027 // The only case where we'd want to promote LOAD (rather than it being
26028 // promoted as an operand) is when its only use is live-out.
26029 if (UI->getOpcode() != ISD::CopyToReg)
26030 return false;
26031 }
26032 }
26033 Promote = true;
26034 break;
26035 }
26036 case ISD::SIGN_EXTEND:
26037 case ISD::ZERO_EXTEND:
26038 case ISD::ANY_EXTEND:
26039 Promote = true;
26040 break;
26041 case ISD::SHL:
26042 case ISD::SRL: {
26043 SDValue N0 = Op.getOperand(0);
26044 // Look out for (store (shl (load), x)).
26045 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26046 return false;
26047 Promote = true;
26048 break;
26049 }
26050 case ISD::ADD:
26051 case ISD::MUL:
26052 case ISD::AND:
26053 case ISD::OR:
26054 case ISD::XOR:
26055 Commute = true;
26056 // fallthrough
26057 case ISD::SUB: {
26058 SDValue N0 = Op.getOperand(0);
26059 SDValue N1 = Op.getOperand(1);
26060 if (!Commute && MayFoldLoad(N1))
26061 return false;
26062 // Avoid disabling potential load folding opportunities.
26063 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26064 return false;
26065 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26066 return false;
26067 Promote = true;
26068 }
26069 }
26070
26071 PVT = MVT::i32;
26072 return Promote;
26073}
26074
26075//===----------------------------------------------------------------------===//
26076// X86 Inline Assembly Support
26077//===----------------------------------------------------------------------===//
26078
26079namespace {
26080 // Helper to match a string against a sequence of pieces separated by whitespace.
26081 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26082 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26083
26084 for (unsigned i = 0, e = args.size(); i != e; ++i) {
26085 StringRef piece(*args[i]);
26086 if (!s.startswith(piece)) // Check if the piece matches.
26087 return false;
26088
26089 s = s.substr(piece.size());
26090 StringRef::size_type pos = s.find_first_not_of(" \t");
26091 if (pos == 0) // We matched a prefix.
26092 return false;
26093
26094 s = s.substr(pos);
26095 }
26096
26097 return s.empty();
26098 }
26099 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26100}
26101
26102static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26103
26104 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26105 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26106 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26107 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26108
26109 if (AsmPieces.size() == 3)
26110 return true;
26111 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26112 return true;
26113 }
26114 }
26115 return false;
26116}
26117
26118bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26119 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26120
26121 std::string AsmStr = IA->getAsmString();
26122
26123 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26124 if (!Ty || Ty->getBitWidth() % 16 != 0)
26125 return false;
26126
26127 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26128 SmallVector<StringRef, 4> AsmPieces;
26129 SplitString(AsmStr, AsmPieces, ";\n");
26130
26131 switch (AsmPieces.size()) {
26132 default: return false;
26133 case 1:
26134 // FIXME: this should verify that we are targeting a 486 or better. If not,
26135 // we will turn this bswap into something that will be lowered to logical
26136 // ops instead of emitting the bswap asm. For now, we don't support 486 or
26137 // lower so don't worry about this.
26138 // bswap $0
26139 if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26140 matchAsm(AsmPieces[0], "bswapl", "$0") ||
26141 matchAsm(AsmPieces[0], "bswapq", "$0") ||
26142 matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26143 matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26144 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26145 // No need to check constraints, nothing other than the equivalent of
26146 // "=r,0" would be valid here.
26147 return IntrinsicLowering::LowerToByteSwap(CI);
26148 }
26149
26150 // rorw $$8, ${0:w} --> llvm.bswap.i16
26151 if (CI->getType()->isIntegerTy(16) &&
26152 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26153 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26154 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26155 AsmPieces.clear();
26156 const std::string &ConstraintsStr = IA->getConstraintString();
26157 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26158 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26159 if (clobbersFlagRegisters(AsmPieces))
26160 return IntrinsicLowering::LowerToByteSwap(CI);
26161 }
26162 break;
26163 case 3:
26164 if (CI->getType()->isIntegerTy(32) &&
26165 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26166 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26167 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26168 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26169 AsmPieces.clear();
26170 const std::string &ConstraintsStr = IA->getConstraintString();
26171 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26172 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26173 if (clobbersFlagRegisters(AsmPieces))
26174 return IntrinsicLowering::LowerToByteSwap(CI);
26175 }
26176
26177 if (CI->getType()->isIntegerTy(64)) {
26178 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26179 if (Constraints.size() >= 2 &&
26180 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26181 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26182 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
26183 if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26184 matchAsm(AsmPieces[1], "bswap", "%edx") &&
26185 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26186 return IntrinsicLowering::LowerToByteSwap(CI);
26187 }
26188 }
26189 break;
26190 }
26191 return false;
26192}
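As a concrete illustration of the case-3/i64 pattern above (assuming a 32-bit x86 target and a hypothetical swap64 wrapper name), the classic hand-written 64-bit byte swap below uses the "A" (EDX:EAX) constraint and is exactly the bswap/bswap/xchgl sequence that gets rewritten to the llvm.bswap.i64 intrinsic:

// Illustration only, for a 32-bit x86 target: the three-instruction i64 swap
// that the Constraints[0] == "A" / Constraints[1] == "0" check above targets.
static inline unsigned long long swap64(unsigned long long x) {
  __asm__("bswap %%eax\n\t"
          "bswap %%edx\n\t"
          "xchgl %%eax, %%edx"
          : "=A"(x)
          : "0"(x));
  return x;
}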
26193
26194/// getConstraintType - Given a constraint letter, return the type of
26195/// constraint it is for this target.
26196X86TargetLowering::ConstraintType
26197X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26198 if (Constraint.size() == 1) {
26199 switch (Constraint[0]) {
26200 case 'R':
26201 case 'q':
26202 case 'Q':
26203 case 'f':
26204 case 't':
26205 case 'u':
26206 case 'y':
26207 case 'x':
26208 case 'Y':
26209 case 'l':
26210 return C_RegisterClass;
26211 case 'a':
26212 case 'b':
26213 case 'c':
26214 case 'd':
26215 case 'S':
26216 case 'D':
26217 case 'A':
26218 return C_Register;
26219 case 'I':
26220 case 'J':
26221 case 'K':
26222 case 'L':
26223 case 'M':
26224 case 'N':
26225 case 'G':
26226 case 'C':
26227 case 'e':
26228 case 'Z':
26229 return C_Other;
26230 default:
26231 break;
26232 }
26233 }
26234 return TargetLowering::getConstraintType(Constraint);
26235}
26236
26237/// Examine constraint type and operand type and determine a weight value.
26238/// This object must already have been set up with the operand type
26239/// and the current alternative constraint selected.
26240TargetLowering::ConstraintWeight
26241 X86TargetLowering::getSingleConstraintMatchWeight(
26242 AsmOperandInfo &info, const char *constraint) const {
26243 ConstraintWeight weight = CW_Invalid;
26244 Value *CallOperandVal = info.CallOperandVal;
26245 // If we don't have a value, we can't do a match,
26246 // but allow it at the lowest weight.
26247 if (!CallOperandVal)
26248 return CW_Default;
26249 Type *type = CallOperandVal->getType();
26250 // Look at the constraint type.
26251 switch (*constraint) {
26252 default:
26253 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26254 case 'R':
26255 case 'q':
26256 case 'Q':
26257 case 'a':
26258 case 'b':
26259 case 'c':
26260 case 'd':
26261 case 'S':
26262 case 'D':
26263 case 'A':
26264 if (CallOperandVal->getType()->isIntegerTy())
26265 weight = CW_SpecificReg;
26266 break;
26267 case 'f':
26268 case 't':
26269 case 'u':
26270 if (type->isFloatingPointTy())
26271 weight = CW_SpecificReg;
26272 break;
26273 case 'y':
26274 if (type->isX86_MMXTy() && Subtarget->hasMMX())
26275 weight = CW_SpecificReg;
26276 break;
26277 case 'x':
26278 case 'Y':
26279 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26280 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26281 weight = CW_Register;
26282 break;
26283 case 'I':
26284 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26285 if (C->getZExtValue() <= 31)
26286 weight = CW_Constant;
26287 }
26288 break;
26289 case 'J':
26290 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26291 if (C->getZExtValue() <= 63)
26292 weight = CW_Constant;
26293 }
26294 break;
26295 case 'K':
26296 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26297 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26298 weight = CW_Constant;
26299 }
26300 break;
26301 case 'L':
26302 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26303 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26304 weight = CW_Constant;
26305 }
26306 break;
26307 case 'M':
26308 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26309 if (C->getZExtValue() <= 3)
26310 weight = CW_Constant;
26311 }
26312 break;
26313 case 'N':
26314 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26315 if (C->getZExtValue() <= 0xff)
26316 weight = CW_Constant;
26317 }
26318 break;
26319 case 'G':
26320 case 'C':
26321 if (dyn_cast<ConstantFP>(CallOperandVal)) {
26322 weight = CW_Constant;
26323 }
26324 break;
26325 case 'e':
26326 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26327 if ((C->getSExtValue() >= -0x80000000LL) &&
26328 (C->getSExtValue() <= 0x7fffffffLL))
26329 weight = CW_Constant;
26330 }
26331 break;
26332 case 'Z':
26333 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26334 if (C->getZExtValue() <= 0xffffffff)
26335 weight = CW_Constant;
26336 }
26337 break;
26338 }
26339 return weight;
26340}
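For example, the 'N' range check above corresponds to the unsigned 8-bit immediate accepted by the one-byte port forms of in/out (users typically write the combined constraint "Nd" so larger port numbers can fall back to DX). A rough illustration with a hypothetical outb_port wrapper:

// Illustration only: port 0x80 fits the 'N' range (<= 0xff), so the constant
// gets CW_Constant weight here; a port number above 0xff would not.
static inline void outb_port(unsigned char value) {
  __asm__ volatile("outb %0, %1" : : "a"(value), "N"(0x80));
}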
26341
26342/// LowerXConstraint - try to replace an X constraint, which matches anything,
26343/// with another that has more specific requirements based on the type of the
26344/// corresponding operand.
26345const char *X86TargetLowering::
26346LowerXConstraint(EVT ConstraintVT) const {
26347 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26348 // 'f' like normal targets.
26349 if (ConstraintVT.isFloatingPoint()) {
26350 if (Subtarget->hasSSE2())
26351 return "Y";
26352 if (Subtarget->hasSSE1())
26353 return "x";
26354 }
26355
26356 return TargetLowering::LowerXConstraint(ConstraintVT);
26357}
26358
26359/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26360/// vector. If it is invalid, don't add anything to Ops.
26361void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26362 std::string &Constraint,
26363 std::vector<SDValue>&Ops,
26364 SelectionDAG &DAG) const {
26365 SDValue Result;
26366
26367 // Only support length 1 constraints for now.
26368 if (Constraint.length() > 1) return;
26369
26370 char ConstraintLetter = Constraint[0];
26371 switch (ConstraintLetter) {
26372 default: break;
26373 case 'I':
26374 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26375 if (C->getZExtValue() <= 31) {
26376 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26377 break;
26378 }
26379 }
26380 return;
26381 case 'J':
26382 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26383 if (C->getZExtValue() <= 63) {
26384 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26385 break;
26386 }
26387 }
26388 return;
26389 case 'K':
26390 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26391 if (isInt<8>(C->getSExtValue())) {
26392 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26393 break;
26394 }
26395 }
26396 return;
26397 case 'L':
26398 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26399 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26400 (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26401 Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26402 break;
26403 }
26404 }
26405 return;
26406 case 'M':
26407 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26408 if (C->getZExtValue() <= 3) {
26409 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26410 break;
26411 }
26412 }
26413 return;
26414 case 'N':
26415 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26416 if (C->getZExtValue() <= 255) {
26417 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26418 break;
26419 }
26420 }
26421 return;
26422 case 'O':
26423 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26424 if (C->getZExtValue() <= 127) {
26425 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26426 break;
26427 }
26428 }
26429 return;
26430 case 'e': {
26431 // 32-bit signed value
26432 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26433 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26434 C->getSExtValue())) {
26435 // Widen to 64 bits here to get it sign extended.
26436 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26437 break;
26438 }
26439 // FIXME gcc accepts some relocatable values here too, but only in certain
26440 // memory models; it's complicated.
26441 }
26442 return;
26443 }
26444 case 'Z': {
26445 // 32-bit unsigned value
26446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26447 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26448 C->getZExtValue())) {
26449 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26450 break;
26451 }
26452 }
26453 // FIXME gcc accepts some relocatable values here too, but only in certain
26454 // memory models; it's complicated.
26455 return;
26456 }
26457 case 'i': {
26458 // Literal immediates are always ok.
26459 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26460 // Widen to 64 bits here to get it sign extended.
26461 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26462 break;
26463 }
26464
26465 // In any sort of PIC mode addresses need to be computed at runtime by
26466 // adding in a register or some sort of table lookup. These can't
26467 // be used as immediates.
26468 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26469 return;
26470
26471 // If we are in non-pic codegen mode, we allow the address of a global (with
26472 // an optional displacement) to be used with 'i'.
26473 GlobalAddressSDNode *GA = nullptr;
26474 int64_t Offset = 0;
26475
26476 // Match either (GA), (GA+C), (GA+C1+C2), etc.
26477 while (1) {
26478 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26479 Offset += GA->getOffset();
26480 break;
26481 } else if (Op.getOpcode() == ISD::ADD) {
26482 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26483 Offset += C->getZExtValue();
26484 Op = Op.getOperand(0);
26485 continue;
26486 }
26487 } else if (Op.getOpcode() == ISD::SUB) {
26488 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26489 Offset += -C->getZExtValue();
26490 Op = Op.getOperand(0);
26491 continue;
26492 }
26493 }
26494
26495 // Otherwise, this isn't something we can handle, reject it.
26496 return;
26497 }
26498
26499 const GlobalValue *GV = GA->getGlobal();
26500 // If we require an extra load to get this address, as in PIC mode, we
26501 // can't accept it.
26502 if (isGlobalStubReference(
26503 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26504 return;
26505
26506 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26507 GA->getValueType(0), Offset);
26508 break;
26509 }
26510 }
26511
26512 if (Result.getNode()) {
26513 Ops.push_back(Result);
26514 return;
26515 }
26516 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26517}
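The 'e' case above only admits immediates that fit in a sign-extended 32-bit field, which is what 64-bit ALU encodings such as addq can carry directly. A rough illustration, assuming an x86-64 target and a hypothetical add_imm wrapper:

// Illustration only: 1000000 fits in a sign-extended 32-bit immediate, so the
// range check above accepts it; something like (1LL << 40) would be rejected
// and could not be passed through the 'e' constraint.
static inline long long add_imm(long long x) {
  __asm__("addq %1, %0" : "+r"(x) : "e"(1000000));
  return x;
}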
26518
26519std::pair<unsigned, const TargetRegisterClass*>
26520X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26521 MVT VT) const {
26522 // First, see if this is a constraint that directly corresponds to an LLVM
26523 // register class.
26524 if (Constraint.size() == 1) {
26525 // GCC Constraint Letters
26526 switch (Constraint[0]) {
26527 default: break;
26528 // TODO: Slight differences here in allocation order and leaving
26529 // RIP in the class. Do they matter any more here than they do
26530 // in the normal allocation?
26531 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26532 if (Subtarget->is64Bit()) {
26533 if (VT == MVT::i32 || VT == MVT::f32)
26534 return std::make_pair(0U, &X86::GR32RegClass);
26535 if (VT == MVT::i16)
26536 return std::make_pair(0U, &X86::GR16RegClass);
26537 if (VT == MVT::i8 || VT == MVT::i1)
26538 return std::make_pair(0U, &X86::GR8RegClass);
26539 if (VT == MVT::i64 || VT == MVT::f64)
26540 return std::make_pair(0U, &X86::GR64RegClass);
26541 break;
26542 }
26543 // 32-bit fallthrough
26544 case 'Q': // Q_REGS
26545 if (VT == MVT::i32 || VT == MVT::f32)
26546 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26547 if (VT == MVT::i16)
26548 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26549 if (VT == MVT::i8 || VT == MVT::i1)
26550 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26551 if (VT == MVT::i64)
26552 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26553 break;
26554 case 'r': // GENERAL_REGS
26555 case 'l': // INDEX_REGS
26556 if (VT == MVT::i8 || VT == MVT::i1)
26557 return std::make_pair(0U, &X86::GR8RegClass);
26558 if (VT == MVT::i16)
26559 return std::make_pair(0U, &X86::GR16RegClass);
26560 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26561 return std::make_pair(0U, &X86::GR32RegClass);
26562 return std::make_pair(0U, &X86::GR64RegClass);
26563 case 'R': // LEGACY_REGS
26564 if (VT == MVT::i8 || VT == MVT::i1)
26565 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26566 if (VT == MVT::i16)
26567 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26568 if (VT == MVT::i32 || !Subtarget->is64Bit())
26569 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26570 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26571 case 'f': // FP Stack registers.
26572 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26573 // value to the correct fpstack register class.
26574 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26575 return std::make_pair(0U, &X86::RFP32RegClass);
26576 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26577 return std::make_pair(0U, &X86::RFP64RegClass);
26578 return std::make_pair(0U, &X86::RFP80RegClass);
26579 case 'y': // MMX_REGS if MMX allowed.
26580 if (!Subtarget->hasMMX()) break;
26581 return std::make_pair(0U, &X86::VR64RegClass);
26582 case 'Y': // SSE_REGS if SSE2 allowed
26583 if (!Subtarget->hasSSE2()) break;
26584 // FALL THROUGH.
26585 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26586 if (!Subtarget->hasSSE1()) break;
26587
26588 switch (VT.SimpleTy) {
26589 default: break;
26590 // Scalar SSE types.
26591 case MVT::f32:
26592 case MVT::i32:
26593 return std::make_pair(0U, &X86::FR32RegClass);
26594 case MVT::f64:
26595 case MVT::i64:
26596 return std::make_pair(0U, &X86::FR64RegClass);
26597 // Vector types.
26598 case MVT::v16i8:
26599 case MVT::v8i16:
26600 case MVT::v4i32:
26601 case MVT::v2i64:
26602 case MVT::v4f32:
26603 case MVT::v2f64:
26604 return std::make_pair(0U, &X86::VR128RegClass);
26605 // AVX types.
26606 case MVT::v32i8:
26607 case MVT::v16i16:
26608 case MVT::v8i32:
26609 case MVT::v4i64:
26610 case MVT::v8f32:
26611 case MVT::v4f64:
26612 return std::make_pair(0U, &X86::VR256RegClass);
26613 case MVT::v8f64:
26614 case MVT::v16f32:
26615 case MVT::v16i32:
26616 case MVT::v8i64:
26617 return std::make_pair(0U, &X86::VR512RegClass);
26618 }
26619 break;
26620 }
26621 }
26622
26623 // Use the default implementation in TargetLowering to convert the register
26624 // constraint into a member of a register class.
26625 std::pair<unsigned, const TargetRegisterClass*> Res;
26626 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26627
26628 // Not found as a standard register?
26629 if (!Res.second) {
26630    // Map {st(0)} .. {st(7)} to the corresponding FP stack register FP0 .. FP7.
26631 if (Constraint.size() == 7 && Constraint[0] == '{' &&
26632 tolower(Constraint[1]) == 's' &&
26633 tolower(Constraint[2]) == 't' &&
26634 Constraint[3] == '(' &&
26635 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26636 Constraint[5] == ')' &&
26637 Constraint[6] == '}') {
26638
26639 Res.first = X86::FP0+Constraint[4]-'0';
26640 Res.second = &X86::RFP80RegClass;
26641 return Res;
26642 }
26643
26644 // GCC allows "st(0)" to be called just plain "st".
26645 if (StringRef("{st}").equals_lower(Constraint)) {
26646 Res.first = X86::FP0;
26647 Res.second = &X86::RFP80RegClass;
26648 return Res;
26649 }
26650
26651 // flags -> EFLAGS
26652 if (StringRef("{flags}").equals_lower(Constraint)) {
26653 Res.first = X86::EFLAGS;
26654 Res.second = &X86::CCRRegClass;
26655 return Res;
26656 }
26657
26658 // 'A' means EAX + EDX.
26659 if (Constraint == "A") {
26660 Res.first = X86::EAX;
26661 Res.second = &X86::GR32_ADRegClass;
26662 return Res;
26663 }
26664 return Res;
26665 }
26666
26667 // Otherwise, check to see if this is a register class of the wrong value
26668  // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
26669 // turn into {ax},{dx}.
26670 if (Res.second->hasType(VT))
26671 return Res; // Correct type already, nothing to do.
26672
26673 // All of the single-register GCC register classes map their values onto
26674 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
26675 // really want an 8-bit or 32-bit register, map to the appropriate register
26676 // class and return the appropriate register.
26677 if (Res.second == &X86::GR16RegClass) {
26678 if (VT == MVT::i8 || VT == MVT::i1) {
26679 unsigned DestReg = 0;
26680 switch (Res.first) {
26681 default: break;
26682 case X86::AX: DestReg = X86::AL; break;
26683 case X86::DX: DestReg = X86::DL; break;
26684 case X86::CX: DestReg = X86::CL; break;
26685 case X86::BX: DestReg = X86::BL; break;
26686 }
26687 if (DestReg) {
26688 Res.first = DestReg;
26689 Res.second = &X86::GR8RegClass;
26690 }
26691 } else if (VT == MVT::i32 || VT == MVT::f32) {
26692 unsigned DestReg = 0;
26693 switch (Res.first) {
26694 default: break;
26695 case X86::AX: DestReg = X86::EAX; break;
26696 case X86::DX: DestReg = X86::EDX; break;
26697 case X86::CX: DestReg = X86::ECX; break;
26698 case X86::BX: DestReg = X86::EBX; break;
26699 case X86::SI: DestReg = X86::ESI; break;
26700 case X86::DI: DestReg = X86::EDI; break;
26701 case X86::BP: DestReg = X86::EBP; break;
26702 case X86::SP: DestReg = X86::ESP; break;
26703 }
26704 if (DestReg) {
26705 Res.first = DestReg;
26706 Res.second = &X86::GR32RegClass;
26707 }
26708 } else if (VT == MVT::i64 || VT == MVT::f64) {
26709 unsigned DestReg = 0;
26710 switch (Res.first) {
26711 default: break;
26712 case X86::AX: DestReg = X86::RAX; break;
26713 case X86::DX: DestReg = X86::RDX; break;
26714 case X86::CX: DestReg = X86::RCX; break;
26715 case X86::BX: DestReg = X86::RBX; break;
26716 case X86::SI: DestReg = X86::RSI; break;
26717 case X86::DI: DestReg = X86::RDI; break;
26718 case X86::BP: DestReg = X86::RBP; break;
26719 case X86::SP: DestReg = X86::RSP; break;
26720 }
26721 if (DestReg) {
26722 Res.first = DestReg;
26723 Res.second = &X86::GR64RegClass;
26724 }
26725 }
26726 } else if (Res.second == &X86::FR32RegClass ||
26727 Res.second == &X86::FR64RegClass ||
26728 Res.second == &X86::VR128RegClass ||
26729 Res.second == &X86::VR256RegClass ||
26730 Res.second == &X86::FR32XRegClass ||
26731 Res.second == &X86::FR64XRegClass ||
26732 Res.second == &X86::VR128XRegClass ||
26733 Res.second == &X86::VR256XRegClass ||
26734 Res.second == &X86::VR512RegClass) {
26735 // Handle references to XMM physical registers that got mapped into the
26736 // wrong class. This can happen with constraints like {xmm0} where the
26737 // target independent register mapper will just pick the first match it can
26738 // find, ignoring the required type.
26739
26740 if (VT == MVT::f32 || VT == MVT::i32)
26741 Res.second = &X86::FR32RegClass;
26742 else if (VT == MVT::f64 || VT == MVT::i64)
26743 Res.second = &X86::FR64RegClass;
26744 else if (X86::VR128RegClass.hasType(VT))
26745 Res.second = &X86::VR128RegClass;
26746 else if (X86::VR256RegClass.hasType(VT))
26747 Res.second = &X86::VR256RegClass;
26748 else if (X86::VR512RegClass.hasType(VT))
26749 Res.second = &X86::VR512RegClass;
26750 }
26751
26752 return Res;
26753}
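As an illustration of the register-class choices above (hypothetical carry_of_add wrapper), the "q" constraint below keeps the i8 result in one of a/b/c/d on 32-bit targets (the GR8_ABCD_LRegClass case), i.e. the registers that actually have a byte sub-register there:

// Illustration only: "setc %0" needs a byte register, so the output uses "q".
static inline unsigned char carry_of_add(unsigned a, unsigned b) {
  unsigned char c;
  __asm__("addl %2, %1\n\t"
          "setc %0"
          : "=q"(c), "+r"(a)
          : "r"(b)
          : "cc");
  return c;
}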
26754
26755int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26756 Type *Ty) const {
26757 // Scaling factors are not free at all.
26758 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26759 // will take 2 allocations in the out of order engine instead of 1
26760 // for plain addressing mode, i.e. inst (reg1).
26761 // E.g.,
26762  // vaddps (%rsi,%rdx), %ymm0, %ymm1
26763 // Requires two allocations (one for the load, one for the computation)
26764 // whereas:
26765 // vaddps (%rsi), %ymm0, %ymm1
26766  // Requires just one allocation, freeing an allocation for other operations
26767  // and leaving fewer micro-operations to execute.
26768 //
26769 // For some X86 architectures, this is even worse because for instance for
26770 // stores, the complex addressing mode forces the instruction to use the
26771 // "load" ports instead of the dedicated "store" port.
26772 // E.g., on Haswell:
26773 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26774 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26775 if (isLegalAddressingMode(AM, Ty))
26776    // Scale represents reg2 * scale, so charge a cost of 1
26777    // as soon as a second (index) register is used.
26778 return AM.Scale != 0;
26779 return -1;
26780}
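A minimal sketch of the cost rule above (illustration only, not the LLVM interface; names are hypothetical): a legal mode is charged one extra unit as soon as it uses a scaled index register, and an illegal mode is reported as -1.

struct AddrModeSketch {
  bool HasBaseReg;  // e.g. %rsi in (%rsi,%rdx,4)
  long long Scale;  // 0 when there is no index register
};

static int scalingFactorCostSketch(const AddrModeSketch &AM, bool IsLegal) {
  if (IsLegal)
    return AM.Scale != 0;  // (%rsi) -> 0, (%rsi,%rdx,4) -> 1
  return -1;               // unsupported addressing mode
}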
26781
26782bool X86TargetLowering::isTargetFTOL() const {
26783 return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
26784}